# crawl_asin.py — crawl asinseed.com pages for Amazon ASINs and extract keyword/product data.
  1. import asyncio
  2. import datetime
  3. import json
  4. import os
  5. import sys
  6. import time
  7. import asyncio
  8. import signal
  9. import asyncio
  10. import pickle
  11. from pathlib import Path,PurePath,PurePosixPath
  12. import random
  13. from typing import List
  14. import httpx
  15. import ssl
  16. from sqlmodel import select, Session
  17. from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode, CrawlResult
  18. from crawl4ai import AsyncWebCrawler, CrawlerRunConfig
  19. from crawl4ai.extraction_strategy import JsonCssExtractionStrategy,JsonXPathExtractionStrategy,ExtractionStrategy
  20. from crawl4ai.content_filter_strategy import BM25ContentFilter
  21. from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator
  22. import base64
  23. from utils.logu import get_logger
  24. from config.settings import OUTPUT_DIR,TEMP_PAGE_DIR
  25. from utils.drission_page import load_chrome_from_ini,ChromeOptions
  26. from utils.file import save_to_file,check_exists,s3,read_file
  27. from config.settings import CFG
  28. from src.browser.crawl_base import CrawlerBase
  29. from upath import UPath
# Module-level logger shared by everything in this file.
logger = get_logger('browser')
# Local cache directory for saved ASIN pages: <OUTPUT_DIR>/page/asin
ASIN_HTML_DIR = OUTPUT_DIR / 'page' / 'asin'
# Side effect at import time: make sure the cache directory exists.
ASIN_HTML_DIR.mkdir(parents=True, exist_ok=True)
  33. class Crawler(CrawlerBase):
  34. s3_prefix = f"{CFG.s3_prefix}/output/asinseed/"
  35. def __init__(self, chrome_options:ChromeOptions):
  36. super().__init__(chrome_options)
  37. def get_asin_url(self, asin:str, asin_area:str):
  38. # https://www.asinseed.com/en/JP?q=B0CQ1SHD8V
  39. return f"https://www.asinseed.com/en/{asin_area}?q={asin}"
  40. def get_asin_page_data(self, asin:str, asin_area:str, mthml_type:bool=True):
  41. page = load_chrome_from_ini(
  42. self.chrome_options
  43. )
  44. url = self.get_asin_url(asin, asin_area)
  45. page.get(url)
  46. if mthml_type:
  47. return page.save()
  48. else:
  49. return page.html
  50. def get_asin_and_save_page(self, asin:str, asin_area:str='JP', mthml_type:bool=True, save_path:str=None, overwrite:bool=False):
  51. if not overwrite and check_exists(save_path):
  52. logger.info(f"exists {save_path} ")
  53. return save_path
  54. data = self.get_asin_page_data(asin, asin_area, mthml_type)
  55. save_path = save_path or str(ASIN_HTML_DIR / f'{asin}{".mhtml" if mthml_type else ".html"}')
  56. return save_to_file(data, save_path)
  57. async def extra_result_table(self, html:str, input_schema:dict={}) -> CrawlResult:
  58. schema = input_schema or{
  59. "baseSelector": "table.table tbody tr", # 每行数据对应一个tr
  60. "fields": [
  61. {
  62. "name": "traffic_keyword",
  63. "selector": "td:first-child a", # 关键词文本
  64. "type": "text"
  65. },
  66. {
  67. "name": "keyword_link",
  68. "selector": "td:first-child a", # 关键词超链接
  69. "type": "attribute",
  70. "attribute": "href"
  71. },
  72. {
  73. "name": "monthly_searches",
  74. "selector": "td:nth-child(2) span", # 搜索量数值
  75. "type": "text",
  76. "transform": lambda x: x.replace(",", "") if x else None # 移除逗号转数字
  77. },
  78. {
  79. "name": "search_trend_link",
  80. "selector": "td:nth-child(2) a", # 搜索量趋势链接(带图表)
  81. "type": "attribute",
  82. "attribute": "href"
  83. },
  84. {
  85. "name": "weight",
  86. "selector": "td:nth-child(3) i.leaf", # 统计叶子图标数量
  87. "type": "count" # 通过计数获取权重值
  88. },
  89. {
  90. "name": "amazon_search_link",
  91. "selector": "td:last-child a", # Amazon搜索链接
  92. "type": "attribute",
  93. "attribute": "href"
  94. }
  95. ]
  96. }
  97. result = await self.excra_strategy_raw_html(html, schema, JsonCssExtractionStrategy)
  98. if not result.success:
  99. logger.error(f"Crawl failed: {result.error_message}")
  100. return
  101. data = json.loads(result.extracted_content)
  102. logger.info(f"Extracted {len(data)} coin rows")
  103. logger.debug(f"First item: {result.extracted_content}")
  104. # [{"traffic_keyword":"","keyword_link":"..."}, {}]
  105. return data
  106. async def excra_product_info(self, html:str, input_schema:dict={}, strategy:ExtractionStrategy=JsonXPathExtractionStrategy) -> CrawlResult:
  107. schema = input_schema or {
  108. "name": "Product Details",
  109. "baseSelector": "div.js-sticky-block",
  110. "fields": [
  111. {
  112. "name": "product_info",
  113. "selector": "#div-asin-product-infor",
  114. "type": "nested",
  115. "fields": [
  116. {
  117. "name": "image_url",
  118. "selector": "div.avatar-self-pic img",
  119. "type": "attribute",
  120. "attribute": "src"
  121. },
  122. {
  123. "name": "goto_amazon",
  124. "selector": "a.btn-asinseed-link",
  125. "type": "attribute",
  126. "attribute": "href"
  127. },
  128. {
  129. "name": "main_text",
  130. "selector": "div.media-body h4",
  131. "type": "text",
  132. "transform": ["strip"]
  133. }
  134. ]
  135. },
  136. {
  137. "name": "unique_words",
  138. "selector": "h3:has(+ article) + article span.badge-asinseed-keywords-weight", # 精准定位到目标article
  139. "type": "list",
  140. "fields": [
  141. {"name": "word", "type": "text"} # 保持字段结构
  142. ],
  143. "transform": ["extract_list"] # 添加转换器将对象列表转为纯文本列表
  144. }
  145. ]
  146. }
  147. result:CrawlResult = await self.excra_strategy_raw_html(html, schema, JsonCssExtractionStrategy)
  148. if not result.success:
  149. logger.error(f"Crawl failed: {result.error_message}")
  150. return
  151. data = json.loads(result.extracted_content)
  152. logger.info(f"Extracted {len(data)} coin rows")
  153. logger.debug(f"result.extracted_content: {result.extracted_content}")
  154. data = data[0] if data else {}
  155. excract_unique_words = data.get('unique_words', [])
  156. if excract_unique_words:
  157. data['unique_words'] = [item['word'] for item in excract_unique_words]
  158. return data
  159. def get_mpath_html_content(self, mhtml_path:str):
  160. mhtml_data = read_file(mhtml_path)
  161. mhtml_path_name = PurePath(mhtml_path).name
  162. temp_mhtml_path = save_to_file(mhtml_data, str(TEMP_PAGE_DIR / mhtml_path_name))
  163. self.get(temp_mhtml_path)
  164. html_content = self.page.html
  165. return temp_mhtml_path,html_content
  166. async def extract_product_and_save_resource(self, html_content:str, upload_s3_dir:str=None):
  167. data = await self.excra_product_info(html_content)
  168. if data['product_info'].get('image_url'):
  169. img_name = UPath(data['product_info']['image_url']).name
  170. img_path = str(UPath(upload_s3_dir) / img_name)
  171. logger.info(f"upload_s3_dir {upload_s3_dir}")
  172. status,save_img_path = await asyncio.to_thread(self.download_img,
  173. data['product_info']['image_url'],
  174. as_img_base64=False,
  175. img_path=img_path)
  176. data['product_info']['img_path'] = save_img_path
  177. logger.info(f"{json.dumps(data, indent=4,ensure_ascii=False)}")
  178. return data
  179. async def task():
  180. asin = ['B0CQ1SHD8V', 'B0B658JC22', 'B0DQ84H883', 'B0D44RT8R8']
  181. c = Crawler(ChromeOptions())
  182. file_path = r'G:\code\amazone\copywriting_production\output\page\debug\B0CQ1SHD8V.html.mhtml'
  183. # file_path = r'G:\code\amazone\copywriting_production\output\page\debug\B0CQ1SHD8V.html'
  184. # tab.get(file_path)
  185. c.get(r'G:\code\amazone\copywriting_production\output\page\debug\B0CQ1SHD8V.html.mhtml')
  186. # res = c.download_img(
  187. # 'https://www.asinseed.com/assets/svg/flat-icons/notice.svg?v=20181122',
  188. # upload_s3_dir='s3://public/amazone/copywriting_production/output/B0CQ1SHD8V/')
  189. # logger.info(f"{res}")
  190. # logger.info(f"{res.extracted_content}")
  191. # res = await c.cralw4ai_run(file_path)
  192. # logger.info(f"{res.model_dump()}")
  193. # save_to_file(res.model_dump(), OUTPUT_DIR/'page\debug\B0CQ1SHD8V.json')
  194. return
  195. page = c.run_browser()
  196. tab = page.latest_tab
  197. data = tab.save()
  198. logger.info(f"{type(data)} , {data[:50]}")
  199. save_to_file(data, 's3://public/amazone/copywriting_production/output/B0CQ1SHD8V.html',Metadata={'mykey':'myvalue','mykey2':'myvalue2'})
  200. def main():
  201. asyncio.run(task())
  202. # test()
  203. if __name__ == "__main__":
  204. main()