Browse Source

Keyword search after adding DrissionPage

mrh 10 months ago
parent
commit
78e12d7b83
15 changed files with 245 additions and 38 deletions
  1. .env (+1 −0)
  2. .gitignore (+2 −1)
  3. __init__.py (+0 −0)
  4. article.py (+1 −1)
  5. database/sql_model.py (+24 −0)
  6. drission_page_process.py (+71 −0)
  7. mylib/base.py (+8 −2)
  8. mylib/conf/9321.ini (+34 −0)
  9. mylib/drission_page.py (+26 −0)
  10. mylib/settings.py (+4 −0)
  11. mytest/playwright_run_path.py (+43 −0)
  12. mytest/playwright_t.py (+0 −0)
  13. mytest/t.py (+1 −1)
  14. mytest/xpath_search.py (+17 −0)
  15. search_keyward.py (+13 −33)

+ 1 - 0
.env

@@ -0,0 +1 @@
+PYTHONPATH=.

+ 2 - 1
.gitignore

@@ -1,3 +1,4 @@
 output
 venv
-__pycache__
+__pycache__
+.vscode


+ 1 - 1
article.py

@@ -1,7 +1,7 @@
 import asyncio
 from crawl4ai import *
 from get_article_info import get_rearch_result_links
-from base import save_to_file, save_all_result,OUTPUT_DIR,load_from_pickle
+from mylib.base import save_to_file, save_all_result,OUTPUT_DIR,load_from_pickle
 
 
 async def main():

+ 24 - 0
database/sql_model.py

@@ -0,0 +1,24 @@
+from sqlite3 import connect
+from mylib.settings import OUTPUT_DIR, WORK_DIR
+from sqlmodel import Field, SQLModel, create_engine
+sqlite_file_name = OUTPUT_DIR / "database.db"
+sqlite_url = f"sqlite:///{sqlite_file_name}"
+engine = create_engine(sqlite_url, echo=True)
+
+class Keyword(SQLModel, table=True):
+    id: int = Field(default=None, primary_key=True)
+    key_word: str = Field()
+    totoal_pages:int = Field(nullable=True)
+    done:bool = Field(default=False)
+
+def add_or_update():
+    pass
+def create_db_and_tables():
+    SQLModel.metadata.create_all(engine)
+
+def main():
+    create_db_and_tables()
+    
+
+if __name__ == "__main__":
+    main()
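
The add_or_update helper above is committed as an empty stub. A minimal sketch of how it might be filled in with sqlmodel sessions, assuming an upsert keyed on key_word (the signature and behaviour are assumptions, not part of this commit; the totoal_pages spelling matches the model field as committed):

```python
from sqlmodel import Session, select

def add_or_update(key_word: str, total_pages: int | None = None, done: bool = False):
    # Hypothetical upsert: fetch the keyword by its text, create it if missing,
    # otherwise update the page count and done flag.
    with Session(engine) as session:
        keyword = session.exec(select(Keyword).where(Keyword.key_word == key_word)).first()
        if keyword is None:
            keyword = Keyword(key_word=key_word, totoal_pages=total_pages, done=done)
        else:
            keyword.totoal_pages = total_pages
            keyword.done = done
        session.add(keyword)
        session.commit()
```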

+ 71 - 0
drission_page_process.py

@@ -0,0 +1,71 @@
+import asyncio
+from crawl4ai import *
+from pathlib import Path
+import json
+from DrissionPage import Chromium, ChromiumOptions, ChromiumPage
+from mylib.base import save_to_file, save_all_result,OUTPUT_DIR,save_to_pickle,ensure_output_dir,save_base64_to_file,browser_config,replace_space
+from mylib.drission_page import load_chrome_from_ini
+from search_keyward import filter_links
+from lxml import html  # use the lxml.html module
+
+page = load_chrome_from_ini()
+
+async def aprocess_html(url:str, html:str=''):
+    async with AsyncWebCrawler(config=browser_config) as crawler:
+        result = await crawler.arun(
+            url=url,
+            cache_mode=CacheMode.DISABLED,
+        )
+        return result
+    
+def get_if_cache_exist(page:ChromiumPage, save_to_html_path:Path, url:str):
+    if save_to_html_path.exists():
+        page.get(save_to_html_path)
+    else:
+        page.get(url)
+
+def is_search_result_empty(html_content:str):
+    ''' When the search result page is empty, a <div class="card-section"> element is present '''
+    tree = html.fromstring(html_content)
+
+    # use XPath to find all div elements whose class contains "card-section"
+    card_sections = tree.xpath("//div[contains(@class, 'card-section')]")
+    print(f"card_sections {card_sections}")
+    return len(card_sections) != 0
+async def search_all(cache=True):
+    search_key = 'Acalypha malabarica essential oil'
+    start = 30
+    search_key = replace_space(search_key)
+    url = f"https://www.google.com/search?q={search_key}&start={start}"
+    print(f"search url: {url}")
+    
+    sav_search_key_dir = OUTPUT_DIR / search_key
+    save_to_html_path = sav_search_key_dir / f"{start}.html"
+    if not cache or not save_to_html_path.exists():
+        page.get(url)
+        print(f"save to {save_to_html_path}")
+        save_to_file(page.html,save_to_html_path)
+        html_content = page.html
+    else:
+        with open(save_to_html_path, 'r', encoding='utf-8') as f:
+            html_content = f.read()
+
+    if is_search_result_empty(html_content):
+        save_path = page.get_screenshot(sav_search_key_dir / f"{start}.png")
+        print(f"没有找到多余的页面 {url},退出")
+        print(f"screenshot saved to {save_path}")
+    else:    
+        file_html = f"file://{save_to_html_path}"
+        result:CrawlResult = await aprocess_html(file_html)
+        # result = await aprocess_html(url)
+        # print(f"result.cleaned_html \n{result.cleaned_html}")
+        # print(f"result.links: {len(result.links)}\n {result.links}")
+        linkes = filter_links(result.links)
+        # print(f"linkes: {len(linkes)}\n {linkes}")
+    
+    
+async def main():
+    await search_all()
+    
+if __name__ == "__main__":
+    asyncio.run(main())
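
search_all above handles a single hard-coded keyword and start offset. A sketch of how the same pieces might be driven across Google's pagination, assuming 10 results per page and a simple loop (the function name and the loop itself are assumptions, not part of this commit):

```python
async def search_all_pages(search_key: str, max_pages: int = 20, cache: bool = True):
    # Hypothetical driver: step through result pages until an empty page is hit.
    search_key = replace_space(search_key)
    for start in range(0, max_pages * 10, 10):
        url = f"https://www.google.com/search?q={search_key}&start={start}"
        save_path = OUTPUT_DIR / search_key / f"{start}.html"
        if not cache or not save_path.exists():
            page.get(url)
            save_to_file(page.html, save_path)
        html_content = save_path.read_text(encoding="utf-8")
        if is_search_result_empty(html_content):
            print(f"no results at start={start}, stopping")
            break
        result = await aprocess_html(f"file://{save_path}")
        links = filter_links(result.links)
        print(f"start={start}: {len(links)} links")
```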

+ 8 - 2
base.py → mylib/base.py

@@ -16,7 +16,6 @@ browser_config = BrowserConfig(
     verbose=True,
     # use_persistent_context=True,
     user_data_dir=OUTPUT_DIR / "user_data_dir"
-
                                )
 # create the directory if it does not exist
 def ensure_output_dir(output_dir: Path):
@@ -24,7 +23,8 @@ def ensure_output_dir(output_dir: Path):
         output_dir.mkdir(parents=True, exist_ok=True)
 
 # save to file
-def save_to_file(content, filename):
+def save_to_file(content, filename:Path):
+    ensure_output_dir(filename.parent)
     if not isinstance(content, str):
     # if the content can be formatted as JSON, format it
         try:
@@ -57,3 +57,9 @@ def save_all_result(result:CrawlResult, output_dir):
         if hasattr(result, attr) and not attr.startswith("__"):
             format_str = f"{attr}: {result.__getattribute__(attr)}"
             save_to_file(format_str, output_dir / f"{attr}.txt")
+
+# if the string contains spaces, replace them with plus signs
+def replace_space(search_key:str):
+    if search_key.find(" ") != -1:
+        return search_key.replace(" ", "+")
+    return search_key
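
Since ensure_output_dir is now called inside save_to_file, callers can pass a nested output path without pre-creating directories. A small usage sketch (the keyword and filename below are illustrative only):

```python
from mylib.base import OUTPUT_DIR, replace_space, save_to_file

key = replace_space("Acalypha malabarica essential oil")
# output/<key>/ is created on demand by save_to_file via ensure_output_dir.
save_to_file("<html>...</html>", OUTPUT_DIR / key / "30.html")
```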

+ 34 - 0
mylib/conf/9321.ini

@@ -0,0 +1,34 @@
+[paths]
+download_path = .
+tmp_path = 
+
+[chromium_options]
+address = 127.0.0.1:9321
+browser_path = C:\Program Files\Google\Chrome\Application\chrome.exe
+arguments = ['--no-default-browser-check', '--disable-suggestions-ui', '--no-first-run', '--disable-infobars', '--disable-popup-blocking', '--hide-crash-restore-bubble', '--disable-features=PrivacySandboxSettings4', '--user-data-dir=K:\\code\\upwork\\zhang_crawl_bio\\output\\user_data_dir2', '--blink-settings=imagesEnabled=false', '--mute-audio']
+extensions = []
+prefs = {'profile.default_content_settings.popups': 0, 'profile.default_content_setting_values': {'notifications': 2}}
+flags = {}
+load_mode = normal
+user = Default
+auto_port = False
+system_user_path = False
+existing_only = False
+new_env = False
+
+[session_options]
+headers = {'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/603.3.8 (KHTML, like Gecko) Version/10.1.2 Safari/603.3.8', 'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 'connection': 'keep-alive', 'accept-charset': 'GB2312,utf-8;q=0.7,*;q=0.7'}
+
+[timeouts]
+base = 10
+page_load = 30
+script = 30
+
+[proxies]
+http = 
+https = 
+
+[others]
+retry_times = 3
+retry_interval = 2
+

+ 26 - 0
mylib/drission_page.py

@@ -0,0 +1,26 @@
+from DrissionPage import Chromium, ChromiumOptions, ChromiumPage
+from pathlib import Path
+from mylib.settings import OUTPUT_DIR, WORK_DIR, CONFIG_DIR
+
+BROWSER_PATH=r"C:\Program Files\Google\Chrome\Application\chrome.exe"
+
+def genarate_chrome_ini(address="localhost:9321"):
+    port = address.split(':')[1]
+    chrome_options = ChromiumOptions().set_browser_path(BROWSER_PATH)
+    chrome_options.set_address(address)
+    chrome_options.set_user_data_path(str(OUTPUT_DIR / 'user_data_dir2'))
+    chrome_options.no_imgs(True).mute(True)
+    chrome_options.save(CONFIG_DIR / f'{port}.ini')
+
+def load_chrome_from_ini(path=CONFIG_DIR / '9321.ini'):
+    chrome_options = ChromiumOptions(ini_path=path)
+    page = ChromiumPage(chrome_options)
+    return page
+    
+def main():
+    # genarate_chrome_ini()
+    page = load_chrome_from_ini()
+    page.get("https://www.google.com/search?q=acalypha+malabarica+essential+oil&start=30")
+
+if __name__ == "__main__":
+    main()
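
A minimal usage sketch of the generate-then-load split above, assuming the ini is written once and later runs reuse it to obtain a ChromiumPage bound to 127.0.0.1:9321:

```python
from mylib.drission_page import genarate_chrome_ini, load_chrome_from_ini

# One-off: write mylib/conf/9321.ini with the desired browser options.
genarate_chrome_ini(address="127.0.0.1:9321")

# Subsequent runs: load the saved options and drive the browser.
page = load_chrome_from_ini()
page.get("https://www.bing.com")
print(page.title)
```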

+ 4 - 0
mylib/settings.py

@@ -0,0 +1,4 @@
+from pathlib import Path
+WORK_DIR = Path(__file__).parent.parent.absolute()
+OUTPUT_DIR = WORK_DIR / "output"
+CONFIG_DIR = WORK_DIR / "mylib" / "conf"

+ 43 - 0
mytest/playwright_run_path.py

@@ -0,0 +1,43 @@
+import asyncio
+from playwright.async_api import async_playwright, Playwright
+from pathlib import Path
+from playwright.async_api import Playwright, Error as PlaywrightError
+
+async def run(playwright: Playwright):
+    chromium = playwright.chromium # or "firefox" or "webkit".
+    chrome_exe = Path(r"C:\Program Files\Google\Chrome\Application\chrome.exe")
+    print(f"chrome_exe.exists() {chrome_exe.exists()}")
+    browser = await chromium.launch(
+        executable_path=chrome_exe,
+        headless=False
+        )
+    print(f"executable_path: {chromium.executable_path}")
+    page = await browser.new_page()
+    await page.goto("http://example.com")
+    body = await page.inner_html("body")
+    print(body)
+    # other actions...
+    # await browser.close()
+async def connect(playwright: Playwright, port=9321, host='localhost'):
+    retry = 0
+    while retry < 5:
+        try:
+            browser = await playwright.chromium.connect_over_cdp(f"http://{host}:{port}")
+        # catch this failure: playwright._impl._errors.Error: BrowserType.connect_over_cdp: connect ECONNREFUSED ::1:932
+        except PlaywrightError as e:
+            # catch the Playwright exception
+            print(f"Caught a Playwright error: {e}")
+            print("connect_over_cdp error")
+            asyncio.sleep(0.5)
+            retry += 1
+            return
+    context = await browser.new_context()
+    page = await context.new_page()
+    await page.goto("https://cn.bing.com/")
+    print(await page.title())
+    print(await page.inner_html())
+
+async def main():
+    async with async_playwright() as playwright:
+        await connect(playwright)
+asyncio.run(main())
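
The connect retry loop as committed returns after the first failure and calls asyncio.sleep without awaiting it. A sketch of a retrying CDP attach under the same imports (the function name, back-off, and retry count are assumptions, not part of this commit):

```python
async def connect_with_retry(playwright: Playwright, port=9321, host="localhost", retries=5):
    # Hypothetical variant: await the back-off and keep retrying instead of returning.
    for attempt in range(1, retries + 1):
        try:
            return await playwright.chromium.connect_over_cdp(f"http://{host}:{port}")
        except PlaywrightError as e:
            print(f"connect_over_cdp failed (attempt {attempt}/{retries}): {e}")
            await asyncio.sleep(0.5)
    raise RuntimeError(f"could not attach to the browser at {host}:{port}")
```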

+ 0 - 0
playwright_t.py → mytest/playwright_t.py


+ 1 - 1
t.py → mytest/t.py

@@ -1,6 +1,6 @@
 import asyncio
 from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig
-from base import OUTPUT_DIR
+from mylib.base import OUTPUT_DIR
 
 async def main():
     # 1) Reference your persistent data directory

+ 17 - 0
mytest/xpath_search.py

@@ -0,0 +1,17 @@
+from lxml import html  # use the lxml.html module
+
+# read the HTML file
+save_to_html_path = r'K:\code\upwork\zhang_crawl_bio\output\Acalypha+malabarica+essential+oil\30.html'
+with open(save_to_html_path, 'r', encoding='utf-8') as f:
+    html_content = f.read()
+
+# parse the HTML with lxml.html
+tree = html.fromstring(html_content)
+
+# use XPath to find all div elements whose class contains "card-section"
+card_sections = tree.xpath("//div[contains(@class, 'card-section')]")
+
+# print the results
+for section in card_sections:
+    # text_content() returns the text of the element and all of its descendants
+    print(section.text_content())

+ 13 - 33
search_keyward.py

@@ -2,40 +2,20 @@ import asyncio
 from crawl4ai import *
 from pathlib import Path
 import json
-from base import save_to_file, save_all_result,OUTPUT_DIR,save_to_pickle,ensure_output_dir,save_base64_to_file,browser_config
-
-# if the string contains spaces, replace them with plus signs
-def replace_space(search_key:str):
-    if search_key.find(" ") != -1:
-        return search_key.replace(" ", "+")
-    return search_key
+from mylib.base import save_to_file, save_all_result,OUTPUT_DIR,save_to_pickle,ensure_output_dir,save_base64_to_file,browser_config
+from mylib.drission_page import load_chrome_from_ini
 async def google_search(search_key:str, start:int=0, config=None)->CrawlResult:
-    search_key = replace_space(search_key)
-    crawler = AsyncWebCrawler(config=browser_config)
-    # await crawler.crawler_strategy.start()
-    # input("crawler.crawler_strategy.start() press enter to continue")
-    await crawler.start()
-    input("crawler.start() press enter to continue")
-    # result = await crawler.arun(
-    #         url = f"https://www.google.com/search?q={search_key}&start={start}",
-    #         cache_mode=CacheMode.DISABLED,
-    #         config=config,
-    #     )
-    # await crawler.close()
-    # return
-    
-    
-    
-    # async with AsyncWebCrawler(config=browser_config) as crawler:
-    #     url = f"https://www.google.com/search?q={search_key}&start={start}"
-    #     print(f"search url: {url}")
-    #     result = await crawler.arun(
-    #         url=url,
-    #         cache_mode=CacheMode.DISABLED,
-    #         config=config,
-    #     )
-    #     # save_to_pickle(result, OUTPUT_DIR / f"{search_key}.pickle")
-    #     return result
+    async with AsyncWebCrawler(config=browser_config) as crawler:
+        url = f"https://www.google.com/search?q={search_key}&start={start}"
+        print(f"search url: {url}")
+        result = await crawler.arun(
+            url=url,
+            cache_mode=CacheMode.DISABLED,
+            config=config,
+        )
+        # save_to_pickle(result, OUTPUT_DIR / f"{search_key}.pickle")
+        return result
+
 def filter_links(links):
     '''
     input: {