# search_keyward.py

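"""Google keyword search driver.

Pages through Google results for each keyword with crawl4ai's AsyncWebCrawler,
saves the extracted links as JSON under GOOGLE_SEARCH_DIR, and provides
test_* helpers that run saved local HTML files through the same
filter_links / filter_local_domain pipeline.
"""
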
import asyncio
import re
from crawl4ai import *  # used here for AsyncWebCrawler, CrawlerRunConfig, CrawlResult, CacheMode
from pathlib import Path
import json
from lxml import html  # use the lxml.html module
from sqlmodel import Session, select
from mylib.base import (replace_space, save_to_file, save_all_result,
                        save_to_pickle, ensure_output_dir,
                        save_base64_to_file, browser_config)
from mylib.drission_page import load_chrome_from_ini
from database.excel_import import ExcelDatabaseManager, KeywordModel
from config.settings import GOOGLE_SEARCH_DIR
from mylib.crawl_lib_func import filter_links, filter_local_domain
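
# `page` is created by mylib.drission_page.load_chrome_from_ini (presumably a
# DrissionPage/Chromium handle configured from an ini file). In the current flow
# it is only referenced by the commented-out HTML-caching fallback in process_keyword().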
page = load_chrome_from_ini()


async def google_search(url: str, config=None) -> CrawlResult:
    """Fetch `url` with crawl4ai's AsyncWebCrawler and return the CrawlResult.

    Note: the `config` parameter is currently unused; a fixed CrawlerRunConfig
    is built inside the function instead.
    """
    run_config = CrawlerRunConfig(
        magic=True,
        simulate_user=True,
        override_navigator=True
    )
    async with AsyncWebCrawler(config=browser_config) as crawler:
        result = await crawler.arun(
            url=url,
            cache_mode=CacheMode.ENABLED,
            user_agent='random',
            config=run_config,
        )
        # save_to_pickle(result, GOOGLE_SEARCH_DIR / f"{search_key}.pickle")
        return result
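
# Example call (mirrors how process_keyword() below drives google_search):
#     result = await google_search(f"https://www.google.com/search?q={keyword}&start=0")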


def is_search_result_empty(html_content: str) -> bool:
    """Check whether the page contains an element with id="search"
    and whether that element has any children.
    Returns True (empty search result) if there are no child elements.
    """
    tree = html.fromstring(html_content)
    search_elements = tree.xpath('//*[@id="search"]/*')
    return len(search_elements) == 0


def is_already_processed(keyword: str) -> bool:
    """Check whether a keyword has already been processed."""
    save_dir = GOOGLE_SEARCH_DIR / replace_space(keyword) / 'pkl'
    return save_dir.exists() and any(save_dir.glob("*.pickle"))
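
# NOTE: is_search_result_empty() and is_already_processed() are currently only
# referenced from commented-out checks in process_keyword() and search_all().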


async def process_keyword(keyword: str, start=0, pages_num=250, cache=True, skip_exist=True):
    """Process a single keyword: page through its Google results and save the links."""
    global page
    # keyword = replace_space(keyword)
    save_dir = GOOGLE_SEARCH_DIR / keyword
    ensure_output_dir(save_dir)
    # # If the keyword was already processed, return the save directory directly
    # if is_already_processed(keyword):
    #     print(f"Keyword {keyword} already processed, skipping search")
    #     return save_dir
    # Not processed yet, run the search
    for i in range(start, pages_num, 10):
        save_html_path = GOOGLE_SEARCH_DIR / keyword / f"{i}.html"
        url = f"https://www.google.com/search?q={keyword}&start={i}"
        print(f"search url: {url}")
        # If a cached HTML file exists, read it directly
        # if skip_exist and save_html_path.exists():
        #     print(f"Skipping cached file {save_html_path}")
        #     continue
        #     print(f"Reading cached file {save_html_path}")
        # else:
        #     page.get(url)
        #     save_to_file(page.html, save_html_path)
        #     # result: CrawlResult = await google_search(url)
        #     # Save the HTML file
        #     # save_to_file(result.html, save_html_path)
        #     print(f"Saved HTML file {save_html_path}")
        # url = f"file://{save_html_path}"
        result: CrawlResult = await google_search(url)
        # Pretty-print result.links
        # print(json.dumps(result.links, indent=4))
        save_json_path = save_to_file(json.dumps(result.links, indent=4), save_dir / f"links-{i}.json")
        print(f"Saved links file {save_json_path}")
        # if is_search_result_empty(result.html):
        search_res_links = filter_links(result.links)
        if not search_res_links:
            print(f"No more result pages found at {result.url}, stopping")
            break
        # links = filter_links(result.links)
        # print(f"start: {i}, links: {links} \n len: {len(links)}")
        # save_to_pickle(result, save_dir / f"result-{i}.pickle")
    return save_dir


async def search_all():
    """Process all keywords that have not been completed yet."""
    excel_db_manager = ExcelDatabaseManager()
    key_model_list = excel_db_manager.get_keywords_by_status()
    for keyword_model in key_model_list:
        # if is_already_processed(keyword):
        #     print(f"Keyword {keyword} already processed, skipping")
        #     continue
        await process_keyword(keyword_model.key_word)


async def test_single_search():
    # await process_keyword("Acalypha malabarica essential oil", start=0, pages_num=250)
    # save_html_path = Path(r'K:\code\upwork\zhang_crawl_bio\output\debug\正常的搜索结果.html')  # "normal search results" page
    # save_html_path = Path(r'K:\code\upwork\zhang_crawl_bio\output\debug\查询不到内容.html')  # "no results found" page
    # save_html_path = Path(r"K:\code\upwork\zhang_crawl_bio\output\debug\流量异常.html")  # "unusual traffic" block page
    # save_html_path = Path(r"K:\code\upwork\zhang_crawl_bio\output\debug\最后一页.html")  # "last page" page
    save_html_path = Path(r"K:\code\upwork\zhang_crawl_bio\output\debug\只有一页.html")  # "only one page" page
    save_dir = save_html_path.parent
    file_name = save_html_path.name
    url = f"file://{save_html_path}"
    result: CrawlResult = await google_search(url)
    # Pretty-print result.links
    # print(json.dumps(result.links, indent=4))
    save_json_path = save_to_file(json.dumps(result.links, indent=4), save_dir / f"links-{file_name}.json")
    print(f"Saved result.links file {save_json_path}")
    links = filter_links(result.links)
    # print('\n -----------------links:')
    # print(json.dumps(links, indent=4))
    save_json_path = save_to_file(json.dumps(links, indent=4), save_dir / f"links-{file_name}-filter.json")
    links_not_local = filter_local_domain(links)
    print(f"Saved filtered links file {save_json_path}")
    # print('\n -----------------links_not_local:')
    # print(json.dumps(links_not_local, indent=4))
    # print(f"len links_not_local: {len(links_not_local)}")
    save_links_not_local_path = save_to_file(json.dumps(links_not_local, indent=4), save_dir / f"links-{file_name}-filter-not-local.json")
    print(f"Saved links-{file_name}-filter-not-local.json file {save_links_not_local_path}")
    # print(f"start: {i}, links: {links} \n len: {len(links)}")
    # result = await google_search("Acalypha malabarica essential oil", start=50)
    # print(f"result clean html:\n {result.cleaned_html}")
    # print(f"result.links\n {result.links['external']}")
    # res = filter_links(result.links)
    # print(res)
    # Pretty-print
    # print(json.dumps(res, indent=4))


async def test_html_to_doc():
    save_dir = Path(r"K:\code\upwork\zhang_crawl_bio\output\google_search")
    # Collect all sub-folders
    folders = [f for f in save_dir.iterdir() if f.is_dir()]
    print(folders)
    for folder in folders:
        # Collect all HTML files in the folder (not processed further here yet)
        html_files = [f for f in folder.iterdir() if f.suffix == '.html']


async def test_sigle_html_links(save_html_path=None):
    if not save_html_path:
        save_html_path = Path(r"K:\code\upwork\zhang_crawl_bio\output\Acalypha malabarica essential oil\0.html")
    url = f"file://{save_html_path}"
    result: CrawlResult = await google_search(url)
    not_google_external_links = filter_links(result.links)
    links_not_local = filter_local_domain(not_google_external_links)
    # print(links_not_local)
    return links_not_local


async def test_dir_links_not_local(dir_path: Path):
    """Parse every HTML file in the directory and return the filtered link list;
    base_domain entries that point to local paths are excluded.
    """
    html_files = [f for f in dir_path.iterdir() if f.suffix == '.html']
    all_links = []
    for html_file in html_files:
        print(f"Processing {html_file}")
        links = await test_sigle_html_links(html_file)
        print(f"Found {len(links)} links in {html_file}")
        all_links.extend(links)
    print(f"Found {len(all_links)} links in total")
    # Write the links to a file, one per line
    save_to_file(all_links, dir_path / "links.json.txt")
    return all_links


async def main():
    # await search_all()
    # await test_single_search()
    await test_dir_links_not_local(Path(r"K:\code\upwork\zhang_crawl_bio\output\Acalypha malabarica essential oil"))


if __name__ == "__main__":
    asyncio.run(main())