
Finish quick JSON extraction based on the product-information schema

mrh 1 year ago
parent
commit
5609abec44
3 changed files with 287 additions and 24 deletions
  1. docs/gpt/crawler_Schema.md (+204 −1)
  2. src/browser/crawl_asin.py (+72 −23)
  3. tests/mytest/t_crawler.py (+11 −0)

+ 204 - 1
docs/gpt/crawler_Schema.md

@@ -111,5 +111,208 @@ type: "list" means multiple items that are simple dictionaries or single text fi
 type: "nested_list" means repeated complex objects (like `products` or `reviews`).
 Base Fields: We can extract attributes from the container element via `baseFields`. For instance, `data_cat_id` might be `data-cat-id="elect123"`.
 Transforms: We can also define a `transform` if we want to lower/upper case, strip whitespace, or even run a custom function.
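A minimal sketch tying these pieces together — the selectors and field names are hypothetical, chosen only to show `nested_list`, `baseFields`, and `transform` side by side in one schema:

```python
# Hypothetical schema: the container carries data-cat-id, the title is
# lowercased via a transform, and each repeated product is a complex
# object, hence "nested_list".
category_schema = {
    "name": "Product Categories",
    "baseSelector": "div.category",
    "baseFields": [
        # read data-cat-id="elect123" off the container element itself
        {"name": "cat_id", "type": "attribute", "attribute": "data-cat-id"},
    ],
    "fields": [
        {"name": "title", "selector": "h2.cat-title", "type": "text", "transform": "lowercase"},
        {
            "name": "products",
            "selector": "div.product",
            "type": "nested_list",  # repeated complex objects
            "fields": [
                {"name": "name", "selector": "p.name", "type": "text"},
                {"name": "price", "selector": "span.price", "type": "text"},
            ],
        },
    ],
}
print(category_schema["fields"][1]["type"])  # → nested_list
```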
+Running the extraction
+```python
+import json
+import asyncio
+from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, CacheMode
+from crawl4ai.extraction_strategy import JsonCssExtractionStrategy
+
+ecommerce_schema = {
+    # ... the advanced schema from above ...
+}
+raw_html = '...'
+
+async def main():
+    async with AsyncWebCrawler(verbose=True) as crawler:
+        strategy = JsonCssExtractionStrategy(ecommerce_schema, verbose=True)
+        result = await crawler.arun(
+            url=f"raw://{raw_html}",
+            config=CrawlerRunConfig(
+                cache_mode=CacheMode.BYPASS,
+                extraction_strategy=strategy,
+            ),
+        )
+
+        if not result.success:
+            print("Crawl failed:", result.error_message)
+            return
+
+        # Parse the JSON output
+        data = json.loads(result.extracted_content)
+        print(json.dumps(data, indent=2) if data else "No data found.")
+
+asyncio.run(main())
+```
+6. Putting It All Together: A Bigger Example
+Consider a blog site. We have a schema that extracts each post card's URL (via `baseFields` with `"attribute": "href"`), plus the title, date, summary, and author:
+
+```python
+schema = {
+  "name": "Blog Posts",
+  "baseSelector": "a.blog-post-card",
+  "baseFields": [
+    {"name": "post_url", "type": "attribute", "attribute": "href"}
+  ],
+  "fields": [
+    {"name": "title", "selector": "h2.post-title", "type": "text", "default": "No Title"},
+    {"name": "date", "selector": "time.post-date", "type": "text", "default": ""},
+    {"name": "summary", "selector": "p.post-summary", "type": "text", "default": ""},
+    {"name": "author", "selector": "span.post-author", "type": "text", "default": ""}
+  ]
+}
+```
+Then run with `JsonCssExtractionStrategy(schema)` to get an array of blog-post objects, each containing `"post_url"`, `"title"`, `"date"`, `"summary"`, and `"author"`.
+
+7. Tips and Best Practices
+1. Inspect the DOM in Chrome DevTools or Firefox's Inspector to find stable selectors.
+2. Start simple: verify you can extract a single field, then add complexity like nested objects or lists.
+3. Test your schema on partial HTML or a test page before the big crawl.
+4. If the site loads content dynamically, combine with JS execution: you can pass `js_code` or `wait_for` in `CrawlerRunConfig`.
+5. Watch the logs with `verbose=True`: they usually show a warning when a selector is off or the schema is malformed.
+6. Use `baseFields` when you need attributes from the container element itself (e.g., `href`, `data-id`), especially for the "parent" item.
+7. Performance: for large pages, keep your selectors as narrow as possible.
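Tip 4 above can be sketched without running a browser — a hedged sketch only, showing the `js_code` and `wait_for` parameter names (as used on `CrawlerRunConfig`) as a plain kwargs dict; the selector and script are illustrative:

```python
# Hedged sketch of tip 4: JS execution plus a wait condition, expressed as
# the keyword arguments that would be passed to CrawlerRunConfig.
run_kwargs = dict(
    js_code=["window.scrollTo(0, document.body.scrollHeight);"],  # JS run in the page before extraction
    wait_for="css:div.loaded-content",  # block until the dynamic content appears
)
print(sorted(run_kwargs))  # → ['js_code', 'wait_for']
```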
+
+Only two extraction strategies are available: JsonXPathExtractionStrategy and JsonCssExtractionStrategy. They cannot be mixed within a single schema object. The XPath strategy is recommended, unless CSS is the better fit.
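For comparison, a hedged sketch of the blog-card schema above rewritten for JsonXPathExtractionStrategy — the XPath expressions are illustrative and assume the same markup:

```python
# Same fields as the CSS blog schema, but with XPath selectors.
xpath_blog_schema = {
    "name": "Blog Posts",
    "baseSelector": "//a[contains(@class, 'blog-post-card')]",
    "baseFields": [
        {"name": "post_url", "type": "attribute", "attribute": "href"},
    ],
    "fields": [
        {"name": "title", "selector": ".//h2[contains(@class, 'post-title')]", "type": "text", "default": "No Title"},
        {"name": "date", "selector": ".//time[contains(@class, 'post-date')]", "type": "text", "default": ""},
        {"name": "summary", "selector": ".//p[contains(@class, 'post-summary')]", "type": "text", "default": ""},
        {"name": "author", "selector": ".//span[contains(@class, 'post-author')]", "type": "text", "default": ""},
    ],
}
print(len(xpath_blog_schema["fields"]))  # → 4
```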
+--------------
+
+Referring to the instructions above, help me extract the following from this HTML:
+Product Information: the image link, goto_amazone (the href of the <a> tag with class btn-asinseed-link), and the main text content.
+Unique Words: a list made up of all the text inside the article.
+
+```html
+<div class="js-sticky-block" data-has-sticky-header="true" data-offset-target="#logoAndNav" data-sticky-view="lg" data-start-point="#stickyBlockStartPoint" data-end-point="#stickyBlockEndPoint" data-offset-top="32" data-offset-bottom="170">
+
+                    <div id="div-asin-product-infor">
+                        <h3 class="h5 text-asinseed-black font-weight-bold mb-4">Product Information</h3>
+                        <article class="mb-5">
+                            <div class="d-flex mb-1">
+                                <div class="avatar-self-pic mr-3">
+                                    <div class="pop-url-imgs" style="background-image: url(https://m.media-amazon.com/images/I/41hY78XIaiL._AC_US600_.jpg)">
+                                        </div>
+                                    <img class="img-fluid rounded u-xl-avatar item-wh-6r" src="https://m.media-amazon.com/images/I/41hY78XIaiL._AC_US200_.jpg" alt="GOODCHI フレームプロテクター ブレーキケーブルプロテクター 耐摩耗性 柔らかくスパイラル パイプ保護 自転車用 保護スリーブ 10個入">
+                                </div>
+                                <div class="media-body">
+                                    <h4 class="h6 font-weight-normal mb-0">
+                                        <a href="https://www.amazon.co.jp/dp/B0CQ1SHD8V" class="small text-muted" target="_blank">GOODCHI</a><br>
+                                        GOODCHI フレームプロテクター ブレーキケーブルプロテクター 耐摩耗性 柔らかくスパイラル パイプ保護 自転車用 保護スリーブ 10個入<br>
+                                        <span class="small text-muted">B0CQ1SHD8V</span>
+                                        <a href="https://www.amazon.co.jp/dp/B0CQ1SHD8V" target="_blank" class="small btn-asinseed-link text-muted" data-toggle="tooltip" title="" data-original-title="View This Product on Amazon"><i class="iconfont icon-to_amazon small"></i></a>
+                                    </h4>
+                                </div>
+                            </div>
+                        </article>
+                    </div>
+
+                    <div id="div-asin-variation">
+                        <h3 class="h5 text-asinseed-black font-weight-bold mb-4">Variations</h3>
+                            <article class="mb-5" data-animation="flash" data-animation-delay="800" data-animation-duration="1500">
+                                <ul class="list-unstyled u-list" id="variation-parent-asin" data-asin="B0CQ1SHD8V">
+                                    <li class="u-variation-list__link  ">
+                                            <a href="https://www.asinseed.com/en/JP/B0DHCWHMM6?utm_asin=B0CQ1SHD8V"><span class="far fa-dot-circle u-list__link-icon mr-1"></span>色: ブラック+レッド
+                                                &nbsp;<span class="badge badge-pill" data-asin="B0DHCWHMM6">2</span>
+                                                
+                                            </a>
+                                        </li>
+                                        <li class="u-variation-list__link active ">
+                                            <a href="https://www.asinseed.com/en/JP/B0CQ1SHD8V?utm_asin=B0CQ1SHD8V"><span class="far fa-dot-circle u-list__link-icon mr-1"></span>色: ブラック
+                                                &nbsp;<span class="badge badge-pill" data-asin="B0CQ1SHD8V">12</span>
+                                                
+                                            </a>
+                                        </li>
+                                        </ul>
+                            </article>
+                    </div>
+                    <h3 class="h5 text-asinseed-black font-weight-bold mb-4">Unique Words
+                        <i class="far fa-question-circle small" data-trigger="hover" role="button" tabindex="1" data-content="Unique Words is the minimal unit of keywords, keywords(user search terms) is composed of them.<br>You can put these words on your listing's title, search term, bullet points and description, let Amazon <br>think such keywords are your product profile, and give your more traffic.<br>We suggest after you review 10+ competitors, then begin to optimize your listing." data-html="true" data-toggle="popover" data-placement="top" data-container="body" data-original-title="" title=""></i>
+                        <button class="btn btn-xs u-btn-asinseed keywords-copy-clipboard ml-4" title="" data-clipboard-text="カバー
+ロードバイク
+ワイヤーガード
+プロテクター
+フレームパッド
+フレームプロテクター
+キックガード
+プロテクターカバー
+車
+キルト芯
+ケーブルプロテクター
+ハンドルカバー" type="button"><i class="ace-icon fa fa-copy bigger-110"></i> Copy to Clipboard</button>
+                    </h3>
+                    <article class="mb-5">
+                        <!-- style="background-color: rgba(182, 214, 249, 0.1);" -->
+                                <span class="badge badge-pill badge-asinseed-keywords-weight mb-1 high-frequence-word-level-5">カバー</span>
+                            <!-- style="background-color: rgba(182, 214, 249, 0.1);" -->
+                                <span class="badge badge-pill badge-asinseed-keywords-weight mb-1 high-frequence-word-level-4">ロードバイク</span>
+                            <!-- style="background-color: rgba(182, 214, 249, 0.05);" -->
+                                <span class="badge badge-pill badge-asinseed-keywords-weight mb-1 high-frequence-word-level-3">ワイヤーガード</span>
+                            <!-- style="background-color: rgba(182, 214, 249, 0.05);" -->
+                                <span class="badge badge-pill badge-asinseed-keywords-weight mb-1 high-frequence-word-level-3">プロテクター</span>
+                            <!-- style="background-color: rgba(182, 214, 249, 0.05);" -->
+                                <span class="badge badge-pill badge-asinseed-keywords-weight mb-1 high-frequence-word-level-2">フレームパッド</span>
+                            <!-- style="background-color: rgba(182, 214, 249, 0.05);" -->
+                                <span class="badge badge-pill badge-asinseed-keywords-weight mb-1 high-frequence-word-level-2">フレームプロテクター</span>
+                            <!-- style="background-color: rgba(182, 214, 249, 0.05);" -->
+                                <span class="badge badge-pill badge-asinseed-keywords-weight mb-1 high-frequence-word-level-1">キックガード</span>
+                            <!-- style="background-color: rgba(182, 214, 249, 0.05);" -->
+                                <span class="badge badge-pill badge-asinseed-keywords-weight mb-1 high-frequence-word-level-1">プロテクターカバー</span>
+                            <!-- style="background-color: rgba(182, 214, 249, 0.05);" -->
+                                <span class="badge badge-pill badge-asinseed-keywords-weight mb-1 high-frequence-word-level-1">車</span>
+                            <!-- style="background-color: rgba(182, 214, 249, 0.05);" -->
+                                <span class="badge badge-pill badge-asinseed-keywords-weight mb-1 high-frequence-word-level-1">キルト芯</span>
+                            <!-- style="background-color: rgba(182, 214, 249, 0.05);" -->
+                                <span class="badge badge-pill badge-asinseed-keywords-weight mb-1 high-frequence-word-level-1">ケーブルプロテクター</span>
+                            <!-- style="background-color: rgba(182, 214, 249, 0.05);" -->
+                                <span class="badge badge-pill badge-asinseed-keywords-weight mb-1 high-frequence-word-level-1">ハンドルカバー</span>
+                            </article>
+                    <div id="div-introduce-video">
+                        <h3 class="h5 text-asinseed-black font-weight-bold mb-4">AsinSeed Video</h3>
+                        <article class="mb-5">
+                            <a id="header-help-video-btn" style="outline: 0;">
+                                <img src="https://www.asinseed.com/assets/images/video/introduce-video-en-20181122.png" alt="SVG Illustration" style="width: 300px;">
+                            </a>
+                        </article>
+                    </div>
+
+            </div>
+```
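Before wiring a schema into the crawler, the CSS targets can be sanity-checked against the raw HTML using nothing but the standard library — a rough sketch (the class name is taken from the snippet above; the parser below assumes the badge spans are not nested):

```python
from html.parser import HTMLParser

class BadgeCollector(HTMLParser):
    """Collect the text of <span> elements whose class list contains target."""
    def __init__(self, target):
        super().__init__()
        self.target = target
        self.inside = False
        self.words = []

    def handle_starttag(self, tag, attrs):
        if tag == "span" and self.target in dict(attrs).get("class", "").split():
            self.inside = True
            self.words.append("")

    def handle_endtag(self, tag):
        if tag == "span":
            self.inside = False

    def handle_data(self, data):
        if self.inside:
            self.words[-1] += data.strip()

# Reduced sample of the markup above
sample = (
    '<article class="mb-5">'
    '<span class="badge badge-pill badge-asinseed-keywords-weight mb-1">カバー</span>'
    '<span class="badge badge-pill badge-asinseed-keywords-weight mb-1">ロードバイク</span>'
    '</article>'
)
collector = BadgeCollector("badge-asinseed-keywords-weight")
collector.feed(sample)
print(collector.words)  # → ['カバー', 'ロードバイク']
```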
+
+```python
+schema = {
+            "name": "Product Details",
+            "baseSelector": "div.js-sticky-block",
+            "fields": [
+                {
+                    "name": "product_info",
+                    "selector": "#div-asin-product-infor",
+                    "type": "nested",
+                    "fields": [
+                        {
+                            "name": "image_url",
+                            "selector": "div.avatar-self-pic img",
+                            "type": "attribute",
+                            "attribute": "src"
+                        },
+                        {
+                            "name": "goto_amazon",
+                            "selector": "a.btn-asinseed-link",
+                            "type": "attribute",
+                            "attribute": "href"
+                        },
+                        {
+                            "name": "main_text",
+                            "selector": "div.media-body h4",
+                            "type": "text",
+                            "transform": ["strip"]
+                        }
+                    ]
+                },
+                {
+                    "name": "unique_words",
+                    "selector": "h3:contains('Unique Words') + article",
+                    "type": "list",
+                    "fields": [
+                        {
+                            "name": "word",
+                            "selector": "span.badge-asinseed-keywords-weight",
+                            "type": "text"
+                        }
+                    ]
+                }
+            ]
+        }
+```
+
+This didn't extract correctly: {...'unique_words': [{'word': 'カバー'}]}
 
-Referring to the instructions above, help me define a schema that finds the value of each field in this HTML table, including the hyperlinks.
+In theory it should come out as a list of strings, since I can see that everything under the article is span tags.
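The document's own output shows that a `"list"` field yields a list of dicts, so one hedged fix is a small post-processing step (the helper name `flatten_unique_words` is ours, not part of crawl4ai):

```python
def flatten_unique_words(item: dict) -> dict:
    """Turn [{'word': 'カバー'}, ...] into ['カバー', ...] in place."""
    item["unique_words"] = [d["word"] for d in item.get("unique_words", [])]
    return item

# Shape mirrors the extraction result quoted above
extracted = {"unique_words": [{"word": "カバー"}, {"word": "ロードバイク"}]}
print(flatten_unique_words(extracted)["unique_words"])  # → ['カバー', 'ロードバイク']
```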

+ 72 - 23
src/browser/crawl_asin.py

@@ -16,7 +16,7 @@ import ssl
 from sqlmodel import select, Session
 from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode, CrawlResult
 from crawl4ai import AsyncWebCrawler, CrawlerRunConfig
-from crawl4ai.extraction_strategy import JsonCssExtractionStrategy,JsonXPathExtractionStrategy
+from crawl4ai.extraction_strategy import JsonCssExtractionStrategy,JsonXPathExtractionStrategy,ExtractionStrategy
 from crawl4ai.content_filter_strategy import BM25ContentFilter
 from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator
 
@@ -50,6 +50,9 @@ class Crawler():
                 self.chrome_options 
             )
         self.page.get(url)
+        self.browser_config.update({
+            "cdp_url": self.page.browser._driver._websocket_url 
+        })
     
     async def run(self, url:str):
         page = load_chrome_from_ini(
@@ -101,12 +104,22 @@ class Crawler():
         save_path = save_path or str(ASIN_HTML_DIR / f'{asin}.html')
         return save_to_file(data, save_path)
     
-    async def cralw4ai_run(self, uri:str) -> CrawlResult:
+    async def excra_strategy_raw_html(self, raw_html:str, schema:dict, strategy:ExtractionStrategy=JsonXPathExtractionStrategy):
         browser_config = BrowserConfig(
             **self.browser_config,
-            cdp_url = self.page.browser._driver._websocket_url
         )
-        schema = {
+        async with AsyncWebCrawler(config=browser_config) as crawler:
+            result:CrawlResult = await crawler.arun(
+                url=f"raw://{raw_html}",
+                config=CrawlerRunConfig(
+                    cache_mode=CacheMode.BYPASS,
+                    extraction_strategy=strategy(schema,verbose=False)
+                )
+            )
+            return result
+
+    async def extra_result_table(self, html:str, input_schema:dict={}) -> CrawlResult:
+        schema = input_schema or {
             "baseSelector": "table.table tbody tr",  # each data row corresponds to one <tr>
             "fields": [
                 {
@@ -146,24 +159,58 @@ class Crawler():
             ]
         }
         
-        dummy_html = self.page.html
-        raw_url = f"raw://{dummy_html}"
-        async with AsyncWebCrawler(config=browser_config) as crawler:
-            result:CrawlResult = await crawler.arun(
-                url=raw_url,
-                config=CrawlerRunConfig(
-                    cache_mode=CacheMode.BYPASS,
-                    extraction_strategy=JsonCssExtractionStrategy(schema,verbose=True)
-                )
-            )
+        result = await self.excra_strategy_raw_html(html, schema, JsonCssExtractionStrategy)
+        if not result.success:
+            logger.error(f"Crawl failed: {result.error_message}")
+            return
+        data = json.loads(result.extracted_content)
+        logger.info(f"Extracted {len(data)} coin rows")
+        logger.debug(f"First item: {result.extracted_content}")
+        return data
+    
+    async def excra_product_info(self, html:str, input_schema:dict={}, strategy:ExtractionStrategy=JsonXPathExtractionStrategy) -> CrawlResult:
+        schema = input_schema or {
+            "name": "Product Details",
+            "baseSelector": "div.js-sticky-block",
+            "fields": [
+                {
+                    "name": "product_info",
+                    "selector": "#div-asin-product-infor",
+                    "type": "nested",
+                    "fields": [
+                        {
+                            "name": "image_url",
+                            "selector": "div.avatar-self-pic img",
+                            "type": "attribute",
+                            "attribute": "src"
+                        },
+                        {
+                            "name": "goto_amazon",
+                            "selector": "a.btn-asinseed-link",
+                            "type": "attribute",
+                            "attribute": "href"
+                        },
+                        {
+                            "name": "main_text",
+                            "selector": "div.media-body h4",
+                            "type": "text",
+                            "transform": ["strip"]
+                        }
+                    ]
+                },
+                {
+                    "name": "unique_words",
+                    "selector": "h3:has(+ article) + article span.badge-asinseed-keywords-weight",  # pinpoint the target article
+                    "type": "list",
+                    "fields": [
+                        {"name": "word", "type": "text"}  # keep the field structure
+                    ],
+                    "transform": ["extract_list"]  # add a transform to turn the list of objects into a plain list of strings
+                }
+            ]
+        }
+        return await self.excra_strategy_raw_html(html, schema, JsonCssExtractionStrategy)
 
-            if not result.success:
-                logger.error(f"Crawl failed: {result.error_message}")
-                return
-            data = json.loads(result.extracted_content)
-            logger.info(f"Extracted {len(data)} coin rows")
-            logger.debug(f"First item: {result.extracted_content}")
-            return data
 async def task():
     asin = ['B0CQ1SHD8V', 'B0B658JC22', 'B0DQ84H883', 'B0D44RT8R8']
     c = Crawler(ChromeOptions())
@@ -171,9 +218,11 @@ async def task():
     # file_path = r'G:\code\amazone\copywriting_production\output\page\debug\B0CQ1SHD8V.html'
     # tab.get(file_path)
     c.get(r'G:\code\amazone\copywriting_production\output\page\debug\B0CQ1SHD8V.html.mhtml')
-    res = await c.cralw4ai_run(file_path)
+    res = await c.excra_product_info(c.page.html)
+    logger.info(f"{json.loads(res.extracted_content)}")
+    
+    # res = await c.cralw4ai_run(file_path)
     # logger.info(f"{res.model_dump()}")
-    # logger.info(f"{json.loads(res.extracted_content)}")
     # save_to_file(res.model_dump(), OUTPUT_DIR/'page\debug\B0CQ1SHD8V.json')
     return
     page = c.run_browser()

+ 11 - 0
tests/mytest/t_crawler.py

@@ -1,4 +1,15 @@
 
+
+def download_img():
+    from DrissionPage import SessionPage
+
+    c = Crawler(ChromeOptions())
+    url = 'https://m.media-amazon.com/images/I/41hY78XIaiL._AC_US200_.jpg'
+    save_path = r'G:\code\amazone\copywriting_production\output\page\temp'
+
+    res = c.page.download(url, save_path)
+    logger.info(f"{res}")
+
 async def task():
     asin = ['B0CQ1SHD8V', 'B0B658JC22', 'B0DQ84H883', 'B0D44RT8R8']
     c = Crawler(ChromeOptions())