
Export data; follow-up testing; temporary save

mrh 9 months ago
parent
commit
1835feda6a

+ 3 - 1
.gitignore

@@ -7,4 +7,6 @@ __pycache__
 .pytest_cache
 download
 local_proxy_pool/
-*.rdb
+*.rdb
+.env
+CF-Clearance-Scraper

+ 81 - 0
architecture.md

@@ -0,0 +1,81 @@
+
+
+# Testing
+```shell
+C:\Users\mg\.local\bin\aider.exe
+
+python -m pytest tests/test_google_search.py -v
+```
+
+# Tools
+
+## Architecture-related
+3.3k ⭐  smart_open: transparently opens files on s3, hdfs, sftp, ftp, and the local filesystem
+https://github.com/piskvorky/smart_open
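+
+A minimal usage sketch (the bucket and key names below are placeholders):
+```python
+from smart_open import open  # drop-in replacement for the built-in open()
+
+# Stream a remote object exactly like a local file
+with open("s3://my-bucket/raw_pages/example.html", "r", encoding="utf-8") as f:
+    html = f.read()
+```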
+
+2.2k ⭐  PyFilesystem2: an abstract filesystem interface
+https://github.com/PyFilesystem/pyfilesystem2
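+
+A small sketch of the abstract interface (a local directory here; other backends use URLs such as mem:// or ftp://, and the path is a placeholder):
+```python
+import fs  # pyfilesystem2
+
+# Open a directory through the abstract FS interface and list HTML files
+with fs.open_fs("osfs://output") as my_fs:
+    for path in my_fs.walk.files(filter=["*.html"]):
+        print(path)
+```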
+
+
+## Crawling and parsing
+6.6k ⭐  awesome-crawler: a curated list of crawler frameworks
+https://github.com/BruceDone/awesome-crawler
+
+A large collection of crawler tools; search the page for "爬" (crawl)
+https://github.com/GitHubDaily/GitHubDaily/blob/cb618c17a72fc5a62248e5ac863d46fe0164487b/README.md?plain=1#L190
+
+
+330 ⭐  awesome list of web page parsers and data-extraction tools
+https://github.com/kimtth/awesome-azure-openai-llm/blob/9b16663bb4e38bc8760f3f274b92dfcca0ada34a/section/app.md
+Search keywords: https://github.com/search?q=Trafilatura+awesome++language%3AMarkdown&type=code&l=Markdown
+
+
+
+
+34.9k ⭐ markitdown
+https://github.com/microsoft/markitdown
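+
+Basic usage sketch (the input path is a placeholder; markitdown also handles docx, pptx, pdf, etc.):
+```python
+from markitdown import MarkItDown
+
+md = MarkItDown()
+result = md.convert("output/example.html")
+print(result.text_content)  # converted Markdown
+```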
+
+22k ⭐  firecrawl: AI-powered crawling that returns clean, structured pages
+https://github.com/mendableai/firecrawl
+
+17.3k ⭐  ScrapeGraphAI: Python library that uses AI to scrape web pages, parse them into Markdown automatically, and extract custom fields; see the sketch below.
+It can also generate scraping code, so a fixed script can be re-run against a given page.
+https://github.com/ScrapeGraphAI/Scrapegraph-ai/blob/main/README.md
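+
+A rough sketch following the project README; the model name, API key, and URL are placeholders, and an LLM key is required:
+```python
+from scrapegraphai.graphs import SmartScraperGraph
+
+graph_config = {"llm": {"api_key": "YOUR_LLM_KEY", "model": "openai/gpt-4o-mini"}}
+graph = SmartScraperGraph(
+    prompt="Extract the article title and authors",
+    source="https://example.com/article",
+    config=graph_config,
+)
+print(graph.run())  # structured result as a dict
+```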
+
+7.5k ⭐  maxun: no-code; click elements with the mouse to extract all similar data into a table, JSON, or an API endpoint
+https://github.com/getmaxun/maxun
+
+241 ⭐  dendrite: describe in natural language, inside your code, what data the AI should extract; it can also convert pages to Markdown and automate typing, pressing Enter, and submitting forms from natural-language instructions.
+Requires an LLM API key.
+https://github.com/dendrite-systems/dendrite-python-sdk
+
+ReaderLM-v2
+
+3.1k ⭐  trafilatura: web page parser / main-content extractor
+https://github.com/adbar/trafilatura
+Evaluation results for various tools, useful for seeing which ones perform best:
+https://trafilatura.readthedocs.io/en/latest/evaluation.html#results-2022-05-18
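+
+Minimal extraction sketch (the URL is a placeholder):
+```python
+import trafilatura
+
+downloaded = trafilatura.fetch_url("https://example.com/article")
+text = trafilatura.extract(downloaded, include_links=True)  # main content only
+print(text)
+```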
+
+
+339 ⭐  article extractors; a paper plus an evaluation benchmark
+https://github.com/scrapinghub/article-extraction-benchmark
+
+1.4k ⭐  Scrapling: a lightning-fast parser, claimed to be about 240x faster than bs4.
+It can search for similar elements to speed up extraction, and offers smart navigation for quickly jumping to parent, child, and sibling elements.
+If an element's attributes change, it can intelligently re-identify the changed element.
+https://github.com/D4Vinci/Scrapling
+
+### Anti-bot-detection browsers
+365 ⭐  rebrowser-playwright: patches stock Playwright to avoid automation detection
+https://github.com/rebrowser/rebrowser-playwright-python
+
+980 ⭐  camoufox: anti-bot-detection browser
+https://github.com/daijro/camoufox
+Browser startup info: about:support
+
+Bot-detection test sites:
+https://www.browserscan.net/bot-detection
+https://bot.sannysoft.com/
+
+Check whether your proxy IP and browser fingerprint look genuine:
+https://www.browserscan.net

+ 79 - 0
tests/test_converter_base.py

@@ -0,0 +1,79 @@
+import pytest
+from pathlib import Path
+from worker.html_convert.converter_base import ConverterBase
+
+class TestConverterBase:
+    """Test suite for ConverterBase class"""
+    
+    @pytest.fixture
+    def converter(self):
+        return ConverterBase()
+    
+    def test_extract_content_after_first_h1(self, converter):
+        """Test extracting content after first H1"""
+        sample_md = """
+Some header content to skip
+
+## PERMALINK
+Copy
+
+# Main Title Here
+
+Content starts here
+"""
+        expected = "# Main Title Here\n\nContent starts here"
+        result = converter.extract_content_after_first_h1(sample_md)
+        assert result.strip() == expected.strip()
+        
+    def test_fix_inline_links(self, converter):
+        """Test fixing inline links"""
+        # Test case 1: Relative URL with domain
+        sample_md_1 = "[Author Name](https://example.com/<https://actual.com/path>)"
+        expected_1 = "[Author Name](https://actual.com/path)"
+        assert converter.fix_inline_links(sample_md_1) == expected_1
+        
+        # Test case 2: Absolute URL
+        sample_md_2 = "[PMC Copyright](https://pmc.ncbi.nlm.nih.gov/articles/PMC9919988/</about/copyright/>)"
+        expected_2 = "[PMC Copyright](https://pmc.ncbi.nlm.nih.gov/articles/PMC9919988/about/copyright/)"
+        assert converter.fix_inline_links(sample_md_2) == expected_2
+        
+        # Test case 3: Already correct link
+        sample_md_3 = "[Normal Link](https://correct.com/path)"
+        expected_3 = "[Normal Link](https://correct.com/path)"
+        assert converter.fix_inline_links(sample_md_3) == expected_3
+        
+        # Test case 4: Image link with empty alt text
+        sample_md_4 = "![](https://pub.mdpi-res.com/img/table.png) [](https://www.mdpi.com/1420-3049/29/22/<#table_body_display_molecules-29-05310-t003>)"
+        expected_4 = "![](https://pub.mdpi-res.com/img/table.png) [](https://www.mdpi.com/1420-3049/29/22/#table_body_display_molecules-29-05310-t003)"
+        assert converter.fix_inline_links(sample_md_4) == expected_4
+    
+    def test_add_url_header(self, converter):
+        """Test adding URL header"""
+        content = "Some markdown content"
+        url = "https://example.com"
+        expected = "[https://example.com](https://example.com)\n\nSome markdown content"
+        assert converter.add_url_header(content, url) == expected
+    
+    def test_filter_markdown_integration(self, converter):
+        """Integration test for filter_markdown"""
+        sample_md = """
+[ Skip to main content ](https://pmc.ncbi.nlm.nih.gov/articles/PMC9919988/<#main-content>)
+![](https://pmc.ncbi.nlm.nih.gov/static/img/us_flag.svg)
+
+## PERMALINK
+Copy
+
+# Main Title Here
+
+### Author Name
+[Author](https://example.com/<https://actual.com/path>)
+"""
+        url = "https://example.com"
+        expected = ("[https://example.com](https://example.com)\n\n"
+                    "# Main Title Here\n\n"
+                    "### Author Name\n"
+                    "[Author](https://actual.com/path)")
+        result = converter.filter_markdown(sample_md)
+        result = converter.add_url_header(result, url)
+        assert result.strip() == expected.strip()
+

+ 16 - 15
worker/celery/html_convert_tasks.py

@@ -3,6 +3,7 @@ from worker.html_convert.pandoc import process_single_example, process_all_resul
 from mylib.logu import get_logger
 from worker.search_engine.search_result_db import SearchResultItem, SearchResultManager
 from sqlmodel import Session, select
+from worker.search_engine.valid_google_search import ValidSearchResult
 
 logger = get_logger('pandoc_tasks')
 
@@ -44,21 +45,20 @@ def convert_all_results_task():
         return {"status": "failed", "error": str(e)}
 
 def test_task_process_all_results():
-    # Process all results in the database
-    db_manager = SearchResultManager()
-    with Session(db_manager.engine) as session:
-        # Fetch all IDs with explicit ordering
-        result_ids = session.exec(select(SearchResultItem.id, SearchResultItem.html_path).order_by(SearchResultItem.id)).all()
-        logger.info(f"Total results: {len(result_ids)}")
-        logger.info(f"First 5 result IDs: {result_ids[:5]}")
-        
-        for result_id, html_path in result_ids:
-            try:
-                if html_path.endswith('.html'):
-                    logger.info(f"Submitting task for SearchResultItem ID: {result_id}")
-                    convert_single_result_task.delay(result_id)
-            except Exception as e:
-                logger.error(f"Error processing result {result_id}: {e}")
+    # Process all valid results using ValidSearchResult
+    valid_search = ValidSearchResult()
+    valid_items = valid_search.get_valid_search_result_items()
+    
+    logger.info(f"Total valid results: {len(valid_items)}")
+    logger.info(f"First 5 valid result IDs: {[item.id for item in valid_items[:5]]}")
+    
+    for item in valid_items:
+        try:
+            if item.html_path and item.html_path.endswith('.html'):
+                logger.info(f"Submitting task for valid SearchResultItem ID: {item.id}")
+                convert_single_result_task.delay(item.id)
+        except Exception as e:
+            logger.error(f"Error processing valid result {item.id}: {e}")
 
 
 def clear_existing_tasks():
     """Clear all pending tasks"""
@@ -71,6 +71,7 @@ def clear_existing_tasks():
 def main():
     test_task_process_all_results()
     # clear_existing_tasks()
+    pass
 
 
 if __name__ == "__main__":
     main()

+ 102 - 5
worker/html_convert/converter_base.py

@@ -36,7 +36,30 @@ class ConverterBase:
         return convert_dir
     
     def extract_content_after_first_h1(self, content: str) -> str:
-        """Extract content starting from the first H1 heading"""
+        """
+        Extract content starting from the first H1 heading.
+        This removes any content before the first H1 tag.
+        
+        Example:
+        Input:
+        ```
+        Some header content
+        
+        ## Subtitle
+        More content
+        
+        # First Main Title
+        
+        Actual content starts here
+        ```
+        
+        Output:
+        ```
+        # First Main Title
+        
+        Actual content starts here
+        ```
+        """
         h1_pattern = r'^# .+$'
         match = re.search(h1_pattern, content, re.MULTILINE)
         if match:
@@ -44,14 +67,41 @@ class ConverterBase:
         return content
     
     def fix_inline_links(self, content: str) -> str:
-        """Fix inline links by handling the special URL patterns"""
-        link_pattern = r'\[([^\]]+)\]\(([^<]*)<([^>]*)>\)'
+        """
+        Fix inline links by handling special URL patterns.
+        This method processes markdown links in the format:
+        [text](domain<url>) and converts them to [text](url).
+        
+        Handles three cases:
+        1. If URL is relative, it combines with domain
+        2. If URL is absolute, it uses the URL directly
+        3. If link has empty text but contains <> pattern
+        
+        Examples:
+        1. [Author](https://example.com/<https://actual.com/path>)
+           => [Author](https://actual.com/path)
+        
+        2. [Link](https://domain.com/<relative/path>)
+           => [Link](https://domain.com/relative/path)
+           
+        3. ![](image.png) [](https://domain.com/<#anchor>)
+           => ![](image.png) [](https://domain.com/#anchor)
+        """
+        link_pattern = r'\[([^\]]*)\]\(([^<]*)<([^>]*)>\)'
         
         
         def replace_link(match):
             text = match.group(1)
             domain = match.group(2)
             url = match.group(3)
             
+            if not text and url.startswith('#'):
+                # Handle empty text with anchor links
+                if domain:
+                    parsed_domain = urlparse(domain)
+                    base_url = f"{parsed_domain.scheme}://{parsed_domain.netloc}{parsed_domain.path}"
+                    return f'[]({base_url}{url})'
+                return f'[]({url})'
+            
             if url.startswith('/'):
                 if domain:
                     parsed_domain = urlparse(domain)
@@ -64,11 +114,58 @@ class ConverterBase:
         return re.sub(link_pattern, replace_link, content)
     
     def add_url_header(self, content: str, url: str) -> str:
-        """Add URL as a header at the top of the content"""
+        """
+        Add URL as a header at the top of the content.
+        The URL is added in markdown link format:
+        [URL](URL)
+        
+        Example:
+        Input:
+        ```
+        Some content
+        ```
+        
+        With URL: https://example.com
+        
+        Output:
+        ```
+        [https://example.com](https://example.com)
+        
+        Some content
+        ```
+        """
         return f"[{url}]({url})\n\n{content}"
         return f"[{url}]({url})\n\n{content}"
     
     
     def filter_markdown(self, content: str) -> str:
     def filter_markdown(self, content: str) -> str:
-        """Filter markdown content according to specified rules"""
+        """
+        Filter markdown content according to specified rules:
+        1. Remove content before first H1
+        2. Fix inline links
+        3. (URL header is added separately)
+        
+        Example:
+        Input:
+        ```
+        [ Skip to main content ](https://pmc.ncbi.nlm.nih.gov/articles/PMC9919988/<#main-content>)
+        ![](https://pmc.ncbi.nlm.nih.gov/static/img/us_flag.svg)
+        
+        ## PERMALINK
+        Copy
+        
+        # Main Title Here
+        
+        ### Author Name
+        [Author](https://example.com/<https://actual.com/path>)
+        ```
+        
+        Output:
+        ```
+        # Main Title Here
+        
+        ### Author Name
+        [Author](https://actual.com/path)
+        ```
+        """
         content = self.extract_content_after_first_h1(content)
         logger.info(f"extract_content_after_first_h1: {content[:300]}")
         content = self.fix_inline_links(content)

+ 171 - 0
worker/html_convert/export_files/copy_files.py

@@ -0,0 +1,171 @@
+from pathlib import Path
+import shutil
+
+
+def list_files(directory) -> list[Path]:
+    """
+    Walk the given directory recursively and return a list of all file paths.
+    :param directory: directory to walk
+    :return: list of file paths
+    """
+    directory = Path(directory).resolve()
+    file_paths = []
+    for file in directory.rglob("*"):
+        if file.is_file():
+            file_paths.append(file)
+    return file_paths
+
+
+def parse_file_path(file_path):
+    """
+    Parse a file path and extract {keyword}, {id}, and the file type (docling or filtered).
+    :param file_path: file path
+    :return: (keyword, id, file_type, extension) or None
+    """
+    # Get the path relative to the export root
+    file_path = Path(file_path).resolve()
+    rel_path = file_path.relative_to(Path("output/copy_exported_files").resolve())
+    parts = rel_path.parts
+    if len(parts) < 2:
+        raise ValueError(f"Invalid file path: {file_path}")
+    
+    keyword = parts[0]
+    filename = parts[-1]
+    name = Path(filename).stem
+    ext = Path(filename).suffix
+    
+    # Check whether this is a PDF file
+    if ext.lower() == ".pdf":
+        return keyword, name, "pdf", ext
+    
+    try:
+        id_part, file_type = name.rsplit("_", 1)
+    except ValueError:
+        print(f"Skipping invalid filename format: {filename}")
+        return None
+    
+    return keyword, id_part, file_type, ext
+
+
+def restore_files(source_dir, target_dir, dry_run=False):
+    """
+    Restore files from source_dir into target_dir, organized by the target directory structure.
+    :param source_dir: source directory path
+    :param target_dir: target directory path
+    :param dry_run: dry-run mode (only print the operations instead of copying)
+    """
+    # List all files
+    file_paths = list_files(source_dir)
+    
+    for file_path in file_paths:
+        try:
+            # Check the file extension
+            if file_path.suffix.lower() not in [".md", ".docx", ".pdf"]:
+                print(f"Skipping non-target file type: {file_path}")
+                continue
+            
+            # Parse the file path
+            parsed = parse_file_path(file_path)
+            if not parsed:
+                continue
+            
+            keyword, id_part, file_type, ext = parsed
+            
+            # Build the target path
+            if file_type == "pdf":
+                target_subdir = Path(target_dir) / keyword / "crawled_urls"
+                target_file = target_subdir / f"{id_part}{ext}"
+            else:
+                target_subdir = Path(target_dir) / keyword / "html_convert"
+                target_file = target_subdir / f"{id_part}_{file_type}{ext}"
+            
+            target_subdir.mkdir(parents=True, exist_ok=True)
+            
+            # Check whether the target file already exists
+            target_file_path = Path(target_file)
+            if target_file_path.exists():
+                print(f"File already exists, skipping: {target_file}")
+                continue
+            
+            # Dry-run mode
+            if dry_run:
+                print(f"[Dry Run] Would copy: {file_path} -> {target_file.absolute()}")
+                continue
+            
+            # Copy the file
+            shutil.copy(file_path, target_file)
+            print(f"Copied: {file_path} -> {target_file}")
+            
+        except Exception as e:
+            print(f"Error processing {file_path}: {e}")
+
+
+def reverse_restore_files(source_dir, target_dir, dry_run=False):
+    """
+    Copy files from source_dir (results) back into target_dir (copy_exported_files).
+    :param source_dir: source directory path (results)
+    :param target_dir: target directory path (copy_exported_files)
+    :param dry_run: dry-run mode (only print the operations instead of copying)
+    """
+    # List all files
+    file_paths = list_files(source_dir)
+    
+    for file_path in file_paths[20000:]:
+        try:
+            # Check the file extension
+            if file_path.suffix.lower() not in [".md", ".docx", ".pdf"]:
+                # print(f"Skipping non-target file type: {file_path}")
+                continue
+            
+            # Get the relative path
+            rel_path = file_path.relative_to(Path(source_dir).resolve())
+            parts = rel_path.parts
+            
+            if len(parts) < 3:
+                print(f"Skipping invalid path: {file_path}")
+                continue
+            
+            keyword = parts[0]
+            folder_type = parts[1]
+            filename = parts[-1]
+            
+            if folder_type == "html_convert":
+                # Handle html_convert files
+                target_file = Path(target_dir) / keyword / filename
+            elif folder_type == "crawled_urls":
+                # Handle PDF files from crawled_urls
+                target_file = Path(target_dir) / keyword / filename
+            else:
+                print(f"Unknown folder type: {folder_type}")
+                continue
+            
+            # Check whether the target file already exists
+            if target_file.exists():
+                print(f"File already exists, skipping: {target_file}")
+                continue
+            
+            # Dry-run mode
+            if dry_run:
+                print(f"[Dry Run] Would copy: {file_path} -> {target_file}")
+                continue
+            
+            # Create the target directory
+            target_file.parent.mkdir(parents=True, exist_ok=True)
+            
+            # Copy the file
+            shutil.copy(file_path, target_file)
+            print(f"Copied: {file_path} -> {target_file.absolute()}")
+            
+        except Exception as e:
+            print(f"Error processing {file_path}: {e}")
+
+
+if __name__ == "__main__":
+    # source_directory = "output/copy_exported_files"
+    # target_directory = "output/results"
+    # restore_files(source_directory, target_directory)
+    
+    # Reverse copy
+    reverse_source_directory = "output/results"
+    reverse_target_directory = "output/copy_exported_files"
+    reverse_restore_files(reverse_source_directory, reverse_target_directory)

+ 0 - 0
worker/html_convert/export_files/export_results.py


+ 16 - 12
worker/html_convert/pandoc.py

@@ -51,7 +51,7 @@ class PandocConverter:
             
             
             cmd = [
                 PANDOC_EXE,
-                '-f', 'markdown+yaml_metadata_block',
+                '-f', 'markdown+pipe_tables+simple_tables+multiline_tables',
                 '-t', 'docx',
                 '--reference-doc', self._get_reference_doc(),
                 '-o', str(output_path),
@@ -61,7 +61,7 @@ class PandocConverter:
             if self.include_toc:
                 # Specify heading levels for TOC
                 cmd.insert(-1, '--toc')
-                cmd.insert(-1, '--toc-depth=3')  # Include up to level 3 headings
+                cmd.insert(-1, '--toc-depth=2')  # Include up to level 2 headings
             
             
             # Add verbose flag to capture more information about resource fetching
             cmd.append('--verbose')
@@ -99,7 +99,7 @@ class PandocConverter:
                 return False
             
             # Get the HTML convert result
-            html_convert = session.exec(
+            html_convert: HtmlConvertResult = session.exec(
                 select(HtmlConvertResult)
                 .where(HtmlConvertResult.search_result_item_id == result_id)
             ).first()
@@ -138,7 +138,7 @@ class PandocConverter:
                     logger.info(f"Skipping already converted filtered markdown: {filtered_docx_path}")
                     logger.info(f"Skipping already converted filtered markdown: {filtered_docx_path}")
                     filtered_success = True
                     filtered_success = True
                 else:
                 else:
-                    filtered_success = self.convert_md_to_docx(filtered_md_path, filtered_docx_path)
+                    # filtered_success = self.convert_md_to_docx(filtered_md_path, filtered_docx_path)
+                    filtered_success = False  # temporarily skip docx conversion of the filtered markdown
                     if filtered_success:
                         html_convert.pandoc_docx_path = str(filtered_docx_path)
                         html_convert.is_pandoc_converted = True
@@ -174,21 +174,21 @@ class PandocConverter:
         normal_style.font.size = Pt(12)
         doc.save(str(reference_doc))
 
-def process_single_example(result_id: int):
+def process_single_example(result_id: int, skip_existing=True):
     # Process a single result example
     docling_converter = DoclingConverter()
     search_result_item = docling_converter.get_search_result_item(result_id)
     if search_result_item.html_path.endswith('.html'):
-        docling_converter.process_conversion_by_id(result_id)
+        docling_converter.process_conversion_by_id(result_id, skip_existing=skip_existing)
     
     
     crawl_filter = CrawlFilter()
-    crawl_filter.process_filter_by_id(result_id)
+    crawl_filter.process_filter_by_id(result_id, skip_existing=skip_existing)
     
     
     pandoc_converter = PandocConverter(font_name="宋体", include_toc=True)
-    success = pandoc_converter.process_single_result(result_id, skip_existing=True)
+    logger.info(f"skip_existing {skip_existing}")
+    success = pandoc_converter.process_single_result(result_id, skip_existing=skip_existing)
     if success:
         logger.info(f"Successfully processed result {result_id}")
-        logger.info("Note: You may need to manually update the Table of Contents in Word after opening the document.")
     else:
         logger.error(f"Failed to process result {result_id}")
 
@@ -209,6 +209,10 @@ def process_all_results():
                 logger.error(f"Error processing result {result_id}: {e}")
                 logger.error(f"Error processing result {result_id}: {e}")
 
 
 if __name__ == "__main__":
 if __name__ == "__main__":
-    # Example usage
-    # process_single_example(6)
-    process_all_results()
+    # Time the run
+    import time
+    start_time = time.time()
+    process_single_example(996, skip_existing=False)
+    end_time = time.time()
+    print(f"Total time: {end_time - start_time} seconds")
+    # process_all_results()

+ 23 - 1
worker/search_engine/valid_google_search.py

@@ -1,9 +1,11 @@
+import json
+import os
 import time
 import re
 import logging
 from pathlib import Path
 from typing import Dict, Optional, List
-from DrissionPage import ChromiumPage
+from DrissionPage import ChromiumOptions, ChromiumPage
 from pydantic import BaseModel
 from scrapling import Adaptor
 from sqlmodel import Session, select
@@ -122,6 +124,26 @@ class ValidSearchResult:
         # page.quit()
         return result_item
         
+    def load_dp_page(self, proxy=None, no_imgs=False):
+        chrome_options = ChromiumOptions()
+        if proxy:
+            chrome_options.set_proxy(proxy)
+        # Otherwise fall back to the HTTP_PROXY environment variable, if set
+        elif 'HTTP_PROXY' in os.environ:
+            chrome_options.set_proxy(os.environ['HTTP_PROXY'])
+        chrome_options.auto_port(True)
+        chrome_options.no_imgs(no_imgs)
+        
+        logger.info(f"proxy {proxy}")
+        page = ChromiumPage(chrome_options)
+        tab = page.latest_tab
+        tab.set.cookies  # TODO: no-op as written; cookies are not yet applied to the tab
+        return page
+
+    def load_cookies(self):
+        path = Path(r'G:\code\upwork\zhang_crawl_bio\CF-Clearance-Scraper\cookies.json')
+        export_cookies = json.loads(path.read_text(encoding='utf-8'))
+        return export_cookies.get('cookies')
 
 
 def main():
     vsr = ValidSearchResult()
     vsr = ValidSearchResult()