
Export data; follow-up testing; temporary save

mrh 9 months ago
parent
commit
1835feda6a

+ 3 - 1
.gitignore

@@ -7,4 +7,6 @@ __pycache__
 .pytest_cache
 download
 local_proxy_pool/
-*.rdb
+*.rdb
+.env
+CF-Clearance-Scraper

+ 81 - 0
architecture.md

@@ -0,0 +1,81 @@
+
+
+# Testing
+```shell
+C:\Users\mg\.local\bin\aider.exe
+
+python -m pytest tests/test_google_search.py -v
+```
+
+# Tools
+
+## Architecture-related
+3.3k ⭐  smart_open: transparently opens files on s3, hdfs, sftp, ftp, and the local filesystem
+https://github.com/piskvorky/smart_open
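+
+A minimal usage sketch (the bucket and key names below are placeholders):
+```python
+from smart_open import open  # drop-in replacement for the built-in open()
+
+# Stream a remote object exactly like a local file
+with open("s3://my-bucket/raw_pages/example.html", "r", encoding="utf-8") as f:
+    html = f.read()
+```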
+
+2.2k ⭐  PyFilesystem2: an abstract filesystem interface
+https://github.com/PyFilesystem/pyfilesystem2
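+
+A small sketch of the abstract interface (a local directory here; other backends use URLs such as mem:// or ftp://, and the path is a placeholder):
+```python
+import fs  # pyfilesystem2
+
+# Open a directory through the abstract FS interface and list HTML files
+with fs.open_fs("osfs://output") as my_fs:
+    for path in my_fs.walk.files(filter=["*.html"]):
+        print(path)
+```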
+
+
+## Crawling and parsing
+6.6k ⭐  awesome-crawler: a curated list of crawler frameworks
+https://github.com/BruceDone/awesome-crawler
+
+A large collection of crawler tools; search the page for "爬" (crawl)
+https://github.com/GitHubDaily/GitHubDaily/blob/cb618c17a72fc5a62248e5ac863d46fe0164487b/README.md?plain=1#L190
+
+
+330 ⭐  awesome list of web page parsers and data-extraction tools
+https://github.com/kimtth/awesome-azure-openai-llm/blob/9b16663bb4e38bc8760f3f274b92dfcca0ada34a/section/app.md
+Search keywords: https://github.com/search?q=Trafilatura+awesome++language%3AMarkdown&type=code&l=Markdown
+
+
+
+
+34.9k ⭐ markitdown
+https://github.com/microsoft/markitdown
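+
+Basic usage sketch (the input path is a placeholder; markitdown also handles docx, pptx, pdf, etc.):
+```python
+from markitdown import MarkItDown
+
+md = MarkItDown()
+result = md.convert("output/example.html")
+print(result.text_content)  # converted Markdown
+```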
+
+22k ⭐  firecrawl: AI-powered crawling that returns clean, structured pages
+https://github.com/mendableai/firecrawl
+
+17.3k ⭐  ScrapeGraphAI: Python library that uses AI to scrape web pages, parse them into Markdown automatically, and extract custom fields; see the sketch below.
+It can also generate scraping code, so a fixed script can be re-run against a given page.
+https://github.com/ScrapeGraphAI/Scrapegraph-ai/blob/main/README.md
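+
+A rough sketch following the project README; the model name, API key, and URL are placeholders, and an LLM key is required:
+```python
+from scrapegraphai.graphs import SmartScraperGraph
+
+graph_config = {"llm": {"api_key": "YOUR_LLM_KEY", "model": "openai/gpt-4o-mini"}}
+graph = SmartScraperGraph(
+    prompt="Extract the article title and authors",
+    source="https://example.com/article",
+    config=graph_config,
+)
+print(graph.run())  # structured result as a dict
+```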
+
+7.5k ⭐  maxun: no-code; click elements with the mouse to extract all similar data into a table, JSON, or an API endpoint
+https://github.com/getmaxun/maxun
+
+241 ⭐  dendrite: describe in natural language, inside your code, what data the AI should extract; it can also convert pages to Markdown and automate typing, pressing Enter, and submitting forms from natural-language instructions.
+Requires an LLM API key.
+https://github.com/dendrite-systems/dendrite-python-sdk
+
+ReaderLM-v2
+
+3.1k ⭐  trafilatura: web page parser / main-content extractor
+https://github.com/adbar/trafilatura
+Evaluation results for various tools, useful for seeing which ones perform best:
+https://trafilatura.readthedocs.io/en/latest/evaluation.html#results-2022-05-18
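+
+Minimal extraction sketch (the URL is a placeholder):
+```python
+import trafilatura
+
+downloaded = trafilatura.fetch_url("https://example.com/article")
+text = trafilatura.extract(downloaded, include_links=True)  # main content only
+print(text)
+```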
+
+
+339 ⭐  article extractors; a paper plus an evaluation benchmark
+https://github.com/scrapinghub/article-extraction-benchmark
+
+1.4k ⭐  Scrapling: a lightning-fast parser, claimed to be about 240x faster than bs4.
+It can search for similar elements to speed up extraction, and offers smart navigation for quickly jumping to parent, child, and sibling elements.
+If an element's attributes change, it can intelligently re-identify the changed element.
+https://github.com/D4Vinci/Scrapling
+
+### Anti-bot-detection browsers
+365 ⭐  rebrowser-playwright: patches stock Playwright to avoid automation detection
+https://github.com/rebrowser/rebrowser-playwright-python
+
+980 ⭐  camoufox: anti-bot-detection browser
+https://github.com/daijro/camoufox
+Browser startup info: about:support
+
+Bot-detection test sites:
+https://www.browserscan.net/bot-detection
+https://bot.sannysoft.com/
+
+Check whether your proxy IP and browser fingerprint look genuine:
+https://www.browserscan.net

+ 79 - 0
tests/test_converter_base.py

@@ -0,0 +1,79 @@
+import pytest
+from pathlib import Path
+from worker.html_convert.converter_base import ConverterBase
+
+class TestConverterBase:
+    """Test suite for ConverterBase class"""
+    
+    @pytest.fixture
+    def converter(self):
+        return ConverterBase()
+    
+    def test_extract_content_after_first_h1(self, converter):
+        """Test extracting content after first H1"""
+        sample_md = """
+Some header content to skip
+
+## PERMALINK
+Copy
+
+# Main Title Here
+
+Content starts here
+"""
+        expected = "# Main Title Here\n\nContent starts here"
+        result = converter.extract_content_after_first_h1(sample_md)
+        assert result.strip() == expected.strip()
+        
+    def test_fix_inline_links(self, converter):
+        """Test fixing inline links"""
+        # Test case 1: Relative URL with domain
+        sample_md_1 = "[Author Name](https://example.com/<https://actual.com/path>)"
+        expected_1 = "[Author Name](https://actual.com/path)"
+        assert converter.fix_inline_links(sample_md_1) == expected_1
+        
+        # Test case 2: Absolute URL
+        sample_md_2 = "[PMC Copyright](https://pmc.ncbi.nlm.nih.gov/articles/PMC9919988/</about/copyright/>)"
+        expected_2 = "[PMC Copyright](https://pmc.ncbi.nlm.nih.gov/articles/PMC9919988/about/copyright/)"
+        assert converter.fix_inline_links(sample_md_2) == expected_2
+        
+        # Test case 3: Already correct link
+        sample_md_3 = "[Normal Link](https://correct.com/path)"
+        expected_3 = "[Normal Link](https://correct.com/path)"
+        assert converter.fix_inline_links(sample_md_3) == expected_3
+        
+        # Test case 4: Image link with empty alt text
+        sample_md_4 = "![](https://pub.mdpi-res.com/img/table.png) [](https://www.mdpi.com/1420-3049/29/22/<#table_body_display_molecules-29-05310-t003>)"
+        expected_4 = "![](https://pub.mdpi-res.com/img/table.png) [](https://www.mdpi.com/1420-3049/29/22/#table_body_display_molecules-29-05310-t003)"
+        assert converter.fix_inline_links(sample_md_4) == expected_4
+    
+    def test_add_url_header(self, converter):
+        """Test adding URL header"""
+        content = "Some markdown content"
+        url = "https://example.com"
+        expected = "[https://example.com](https://example.com)\n\nSome markdown content"
+        assert converter.add_url_header(content, url) == expected
+    
+    def test_filter_markdown_integration(self, converter):
+        """Integration test for filter_markdown"""
+        sample_md = """
+[ Skip to main content ](https://pmc.ncbi.nlm.nih.gov/articles/PMC9919988/<#main-content>)
+![](https://pmc.ncbi.nlm.nih.gov/static/img/us_flag.svg)
+
+## PERMALINK
+Copy
+
+# Main Title Here
+
+### Author Name
+[Author](https://example.com/<https://actual.com/path>)
+"""
+        url = "https://example.com"
+        expected = ("[https://example.com](https://example.com)\n\n"
+                    "# Main Title Here\n\n"
+                    "### Author Name\n"
+                    "[Author](https://actual.com/path)")
+        result = converter.filter_markdown(sample_md)
+        result = converter.add_url_header(result, url)
+        assert result.strip() == expected.strip()
+

+ 16 - 15
worker/celery/html_convert_tasks.py

@@ -3,6 +3,7 @@ from worker.html_convert.pandoc import process_single_example, process_all_resul
 from mylib.logu import get_logger
 from worker.search_engine.search_result_db import SearchResultItem, SearchResultManager
 from sqlmodel import Session, select
+from worker.search_engine.valid_google_search import ValidSearchResult
 
 logger = get_logger('pandoc_tasks')
 
@@ -44,21 +45,20 @@ def convert_all_results_task():
         return {"status": "failed", "error": str(e)}
 
 def test_task_process_all_results():
-    # Process all results in the database
-    db_manager = SearchResultManager()
-    with Session(db_manager.engine) as session:
-        # Fetch all IDs with explicit ordering
-        result_ids = session.exec(select(SearchResultItem.id, SearchResultItem.html_path).order_by(SearchResultItem.id)).all()
-        logger.info(f"Total results: {len(result_ids)}")
-        logger.info(f"First 5 result IDs: {result_ids[:5]}")
-        
-        for result_id, html_path in result_ids:
-            try:
-                if html_path.endswith('.html'):
-                    logger.info(f"Submitting task for SearchResultItem ID: {result_id}")
-                    convert_single_result_task.delay(result_id)
-            except Exception as e:
-                logger.error(f"Error processing result {result_id}: {e}")
+    # Process all valid results using ValidSearchResult
+    valid_search = ValidSearchResult()
+    valid_items = valid_search.get_valid_search_result_items()
+    
+    logger.info(f"Total valid results: {len(valid_items)}")
+    logger.info(f"First 5 valid result IDs: {[item.id for item in valid_items[:5]]}")
+    
+    for item in valid_items:
+        try:
+            if item.html_path and item.html_path.endswith('.html'):
+                logger.info(f"Submitting task for valid SearchResultItem ID: {item.id}")
+                convert_single_result_task.delay(item.id)
+        except Exception as e:
+            logger.error(f"Error processing valid result {item.id}: {e}")
 
 
 def clear_existing_tasks():
     """Clear all pending tasks"""
@@ -71,6 +71,7 @@ def clear_existing_tasks():
 def main():
     test_task_process_all_results()
     # clear_existing_tasks()
+    pass
 
 
 if __name__ == "__main__":
     main()

+ 102 - 5
worker/html_convert/converter_base.py

@@ -36,7 +36,30 @@ class ConverterBase:
         return convert_dir
     
     def extract_content_after_first_h1(self, content: str) -> str:
-        """Extract content starting from the first H1 heading"""
+        """
+        Extract content starting from the first H1 heading.
+        This removes any content before the first H1 tag.
+        
+        Example:
+        Input:
+        ```
+        Some header content
+        
+        ## Subtitle
+        More content
+        
+        # First Main Title
+        
+        Actual content starts here
+        ```
+        
+        Output:
+        ```
+        # First Main Title
+        
+        Actual content starts here
+        ```
+        """
         h1_pattern = r'^# .+$'
         match = re.search(h1_pattern, content, re.MULTILINE)
         if match:
@@ -44,14 +67,41 @@ class ConverterBase:
         return content
     
     def fix_inline_links(self, content: str) -> str:
-        """Fix inline links by handling the special URL patterns"""
-        link_pattern = r'\[([^\]]+)\]\(([^<]*)<([^>]*)>\)'
+        """
+        Fix inline links by handling special URL patterns.
+        This method processes markdown links in the format:
+        [text](domain<url>) and converts them to [text](url).
+        
+        Handles three cases:
+        1. If URL is relative, it combines with domain
+        2. If URL is absolute, it uses the URL directly
+        3. If link has empty text but contains <> pattern
+        
+        Examples:
+        1. [Author](https://example.com/<https://actual.com/path>)
+           => [Author](https://actual.com/path)
+        
+        2. [Link](https://domain.com/<relative/path>)
+           => [Link](https://domain.com/relative/path)
+           
+        3. ![](image.png) [](https://domain.com/<#anchor>)
+           => ![](image.png) [](https://domain.com/#anchor)
+        """
+        link_pattern = r'\[([^\]]*)\]\(([^<]*)<([^>]*)>\)'
         
         
         def replace_link(match):
             text = match.group(1)
             domain = match.group(2)
             url = match.group(3)
             
+            if not text and url.startswith('#'):
+                # Handle empty text with anchor links
+                if domain:
+                    parsed_domain = urlparse(domain)
+                    base_url = f"{parsed_domain.scheme}://{parsed_domain.netloc}{parsed_domain.path}"
+                    return f'[]({base_url}{url})'
+                return f'[]({url})'
+            
             if url.startswith('/'):
                 if domain:
                     parsed_domain = urlparse(domain)
@@ -64,11 +114,58 @@ class ConverterBase:
         return re.sub(link_pattern, replace_link, content)
     
     def add_url_header(self, content: str, url: str) -> str:
-        """Add URL as a header at the top of the content"""
+        """
+        Add URL as a header at the top of the content.
+        The URL is added in markdown link format:
+        [URL](URL)
+        
+        Example:
+        Input:
+        ```
+        Some content
+        ```
+        
+        With URL: https://example.com
+        
+        Output:
+        ```
+        [https://example.com](https://example.com)
+        
+        Some content
+        ```
+        """
         return f"[{url}]({url})\n\n{content}"
         return f"[{url}]({url})\n\n{content}"
     
     
     def filter_markdown(self, content: str) -> str:
     def filter_markdown(self, content: str) -> str:
-        """Filter markdown content according to specified rules"""
+        """
+        Filter markdown content according to specified rules:
+        1. Remove content before first H1
+        2. Fix inline links
+        3. (URL header is added separately)
+        
+        Example:
+        Input:
+        ```
+        [ Skip to main content ](https://pmc.ncbi.nlm.nih.gov/articles/PMC9919988/<#main-content>)
+        ![](https://pmc.ncbi.nlm.nih.gov/static/img/us_flag.svg)
+        
+        ## PERMALINK
+        Copy
+        
+        # Main Title Here
+        
+        ### Author Name
+        [Author](https://example.com/<https://actual.com/path>)
+        ```
+        
+        Output:
+        ```
+        # Main Title Here
+        
+        ### Author Name
+        [Author](https://actual.com/path)
+        ```
+        """
         content = self.extract_content_after_first_h1(content)
         logger.info(f"extract_content_after_first_h1: {content[:300]}")
         content = self.fix_inline_links(content)

+ 171 - 0
worker/html_convert/export_files/copy_files.py

@@ -0,0 +1,171 @@
+from pathlib import Path
+import shutil
+
+
+def list_files(directory) -> list[Path]:
+    """
+    Walk the given directory recursively and return a list of all file paths.
+    :param directory: directory to walk
+    :return: list of file paths
+    """
+    directory = Path(directory).resolve()
+    file_paths = []
+    for file in directory.rglob("*"):
+        if file.is_file():
+            file_paths.append(file)
+    return file_paths
+
+
+def parse_file_path(file_path):
+    """
+    Parse a file path and extract {keyword}, {id}, and the file type (docling or filtered).
+    :param file_path: file path
+    :return: (keyword, id, file_type, extension) or None
+    """
+    # Get the path relative to the export root
+    file_path = Path(file_path).resolve()
+    rel_path = file_path.relative_to(Path("output/copy_exported_files").resolve())
+    parts = rel_path.parts
+    if len(parts) < 2:
+        raise ValueError(f"Invalid file path: {file_path}")
+    
+    keyword = parts[0]
+    filename = parts[-1]
+    name = Path(filename).stem
+    ext = Path(filename).suffix
+    
+    # Check whether this is a PDF file
+    if ext.lower() == ".pdf":
+        return keyword, name, "pdf", ext
+    
+    try:
+        id_part, file_type = name.rsplit("_", 1)
+    except ValueError:
+        print(f"Skipping invalid filename format: {filename}")
+        return None
+    
+    return keyword, id_part, file_type, ext
+
+
+def restore_files(source_dir, target_dir, dry_run=False):
+    """
+    Restore files from source_dir into target_dir, organized by the target directory structure.
+    :param source_dir: source directory path
+    :param target_dir: target directory path
+    :param dry_run: dry-run mode (only print the operations instead of copying)
+    """
+    # List all files
+    file_paths = list_files(source_dir)
+    
+    for file_path in file_paths:
+        try:
+            # Check the file extension
+            if file_path.suffix.lower() not in [".md", ".docx", ".pdf"]:
+                print(f"Skipping non-target file type: {file_path}")
+                continue
+            
+            # Parse the file path
+            parsed = parse_file_path(file_path)
+            if not parsed:
+                continue
+            
+            keyword, id_part, file_type, ext = parsed
+            
+            # Build the target path
+            if file_type == "pdf":
+                target_subdir = Path(target_dir) / keyword / "crawled_urls"
+                target_file = target_subdir / f"{id_part}{ext}"
+            else:
+                target_subdir = Path(target_dir) / keyword / "html_convert"
+                target_file = target_subdir / f"{id_part}_{file_type}{ext}"
+            
+            target_subdir.mkdir(parents=True, exist_ok=True)
+            
+            # Check whether the target file already exists
+            target_file_path = Path(target_file)
+            if target_file_path.exists():
+                print(f"File already exists, skipping: {target_file}")
+                continue
+            
+            # Dry-run mode
+            if dry_run:
+                print(f"[Dry Run] Would copy: {file_path} -> {target_file.absolute()}")
+                continue
+            
+            # Copy the file
+            shutil.copy(file_path, target_file)
+            print(f"Copied: {file_path} -> {target_file}")
+            
+        except Exception as e:
+            print(f"Error processing {file_path}: {e}")
+
+
+def reverse_restore_files(source_dir, target_dir, dry_run=False):
+    """
+    Copy files from source_dir (results) back into target_dir (copy_exported_files).
+    :param source_dir: source directory path (results)
+    :param target_dir: target directory path (copy_exported_files)
+    :param dry_run: dry-run mode (only print the operations instead of copying)
+    """
+    # List all files
+    file_paths = list_files(source_dir)
+    
+    for file_path in file_paths[20000:]:
+        try:
+            # Check the file extension
+            if file_path.suffix.lower() not in [".md", ".docx", ".pdf"]:
+                # print(f"Skipping non-target file type: {file_path}")
+                continue
+            
+            # Get the relative path
+            rel_path = file_path.relative_to(Path(source_dir).resolve())
+            parts = rel_path.parts
+            
+            if len(parts) < 3:
+                print(f"Skipping invalid path: {file_path}")
+                continue
+            
+            keyword = parts[0]
+            folder_type = parts[1]
+            filename = parts[-1]
+            
+            if folder_type == "html_convert":
+                # Handle html_convert files
+                target_file = Path(target_dir) / keyword / filename
+            elif folder_type == "crawled_urls":
+                # Handle PDF files from crawled_urls
+                target_file = Path(target_dir) / keyword / filename
+            else:
+                print(f"Unknown folder type: {folder_type}")
+                continue
+            
+            # Check whether the target file already exists
+            if target_file.exists():
+                print(f"File already exists, skipping: {target_file}")
+                continue
+            
+            # Dry-run mode
+            if dry_run:
+                print(f"[Dry Run] Would copy: {file_path} -> {target_file}")
+                continue
+            
+            # Create the target directory
+            target_file.parent.mkdir(parents=True, exist_ok=True)
+            
+            # Copy the file
+            shutil.copy(file_path, target_file)
+            print(f"Copied: {file_path} -> {target_file.absolute()}")
+            
+        except Exception as e:
+            print(f"Error processing {file_path}: {e}")
+
+
+if __name__ == "__main__":
+    # source_directory = "output/copy_exported_files"
+    # target_directory = "output/results"
+    # restore_files(source_directory, target_directory)
+    
+    # Reverse copy
+    reverse_source_directory = "output/results"
+    reverse_target_directory = "output/copy_exported_files"
+    reverse_restore_files(reverse_source_directory, reverse_target_directory)

+ 0 - 0
worker/html_convert/export_files/export_results.py


+ 16 - 12
worker/html_convert/pandoc.py

@@ -51,7 +51,7 @@ class PandocConverter:
             
             
             cmd = [
                 PANDOC_EXE,
-                '-f', 'markdown+yaml_metadata_block',
+                '-f', 'markdown+pipe_tables+simple_tables+multiline_tables',
                 '-t', 'docx',
                 '--reference-doc', self._get_reference_doc(),
                 '-o', str(output_path),
@@ -61,7 +61,7 @@ class PandocConverter:
             if self.include_toc:
                 # Specify heading levels for TOC
                 cmd.insert(-1, '--toc')
-                cmd.insert(-1, '--toc-depth=3')  # Include up to level 3 headings
+                cmd.insert(-1, '--toc-depth=2')  # Include up to level 2 headings
             
             
             # Add verbose flag to capture more information about resource fetching
             cmd.append('--verbose')
@@ -99,7 +99,7 @@ class PandocConverter:
                 return False
             
             # Get the HTML convert result
-            html_convert = session.exec(
+            html_convert: HtmlConvertResult = session.exec(
                 select(HtmlConvertResult)
                 .where(HtmlConvertResult.search_result_item_id == result_id)
             ).first()
@@ -138,7 +138,7 @@ class PandocConverter:
                     logger.info(f"Skipping already converted filtered markdown: {filtered_docx_path}")
                     logger.info(f"Skipping already converted filtered markdown: {filtered_docx_path}")
                     filtered_success = True
                     filtered_success = True
                 else:
                 else:
-                    filtered_success = self.convert_md_to_docx(filtered_md_path, filtered_docx_path)
+                    # filtered_success = self.convert_md_to_docx(filtered_md_path, filtered_docx_path)
+                    filtered_success = False  # temporarily skip docx conversion of the filtered markdown
                     if filtered_success:
                         html_convert.pandoc_docx_path = str(filtered_docx_path)
                         html_convert.is_pandoc_converted = True
@@ -174,21 +174,21 @@ class PandocConverter:
         normal_style.font.size = Pt(12)
         doc.save(str(reference_doc))
 
-def process_single_example(result_id: int):
+def process_single_example(result_id: int, skip_existing=True):
     # Process a single result example
     docling_converter = DoclingConverter()
     search_result_item = docling_converter.get_search_result_item(result_id)
     if search_result_item.html_path.endswith('.html'):
-        docling_converter.process_conversion_by_id(result_id)
+        docling_converter.process_conversion_by_id(result_id, skip_existing=skip_existing)
     
     
     crawl_filter = CrawlFilter()
-    crawl_filter.process_filter_by_id(result_id)
+    crawl_filter.process_filter_by_id(result_id, skip_existing=skip_existing)
     
     
     pandoc_converter = PandocConverter(font_name="宋体", include_toc=True)
-    success = pandoc_converter.process_single_result(result_id, skip_existing=True)
+    logger.info(f"skip_existing {skip_existing}")
+    success = pandoc_converter.process_single_result(result_id, skip_existing=skip_existing)
     if success:
         logger.info(f"Successfully processed result {result_id}")
-        logger.info("Note: You may need to manually update the Table of Contents in Word after opening the document.")
     else:
         logger.error(f"Failed to process result {result_id}")
 
@@ -209,6 +209,10 @@ def process_all_results():
                 logger.error(f"Error processing result {result_id}: {e}")
                 logger.error(f"Error processing result {result_id}: {e}")
 
 
 if __name__ == "__main__":
 if __name__ == "__main__":
-    # Example usage
-    # process_single_example(6)
-    process_all_results()
+    # Time the run
+    import time
+    start_time = time.time()
+    process_single_example(996, skip_existing=False)
+    end_time = time.time()
+    print(f"Total time: {end_time - start_time} seconds")
+    # process_all_results()

+ 23 - 1
worker/search_engine/valid_google_search.py

@@ -1,9 +1,11 @@
+import json
+import os
 import time
 import re
 import logging
 from pathlib import Path
 from typing import Dict, Optional, List
-from DrissionPage import ChromiumPage
+from DrissionPage import ChromiumOptions, ChromiumPage
 from pydantic import BaseModel
 from scrapling import Adaptor
 from sqlmodel import Session, select
@@ -122,6 +124,26 @@ class ValidSearchResult:
         # page.quit()
         return result_item
         
+    def load_dp_page(self, proxy=None, no_imgs=False):
+        chrome_options = ChromiumOptions()
+        if proxy:
+            chrome_options.set_proxy(proxy)
+        # Otherwise fall back to the HTTP_PROXY environment variable, if set
+        elif 'HTTP_PROXY' in os.environ:
+            chrome_options.set_proxy(os.environ['HTTP_PROXY'])
+        chrome_options.auto_port(True)
+        chrome_options.no_imgs(no_imgs)
+        
+        logger.info(f"proxy {proxy}")
+        page = ChromiumPage(chrome_options)
+        tab = page.latest_tab
+        tab.set.cookies  # TODO: no-op as written; cookies are not yet applied to the tab
+        return page
+
+    def load_cookies(self):
+        path = Path(r'G:\code\upwork\zhang_crawl_bio\CF-Clearance-Scraper\cookies.json')
+        export_cookies = json.loads(path.read_text(encoding='utf-8'))
+        return export_cookies.get('cookies')
 
 
 def main():
     vsr = ValidSearchResult()
     vsr = ValidSearchResult()