Browse files

Export data; follow-up tests; temporary save

mrh 9 months ago
parent
revision
1835feda6a

+ 3 - 1
.gitignore

@@ -7,4 +7,6 @@ __pycache__
 .pytest_cache
 download
 local_proxy_pool/
-*.rdb
+*.rdb
+.env
+CF-Clearance-Scraper

+ 81 - 0
architecture.md

@@ -0,0 +1,81 @@
+
+
+# Testing
+```shell
+C:\Users\mg\.local\bin\aider.exe
+
+python -m pytest tests/test_google_search.py -v
+```
+
+# Tools
+
+## Architecture-related
+3.3k ⭐  smart_open: transparently opens files on s3, hdfs, sftp, ftp, and the local filesystem
+https://github.com/piskvorky/smart_open
+
+2.2k ⭐  PyFilesystem2 is an abstract filesystem interface
+https://github.com/PyFilesystem/pyfilesystem2
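+
+A minimal smart_open sketch (bucket and key are placeholders; credentials are assumed to come from the usual boto3/environment configuration). The same `open()` call covers s3/hdfs/sftp/ftp/local paths:
+```python
+from smart_open import open  # drop-in replacement for the builtin open()
+
+# Swap the URI scheme for hdfs://, sftp://, ftp:// or a plain local path.
+with open("s3://my-bucket/reports/example.txt", "r", encoding="utf-8") as f:
+    for line in f:
+        print(line.strip())
+```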
+
+
+## Crawling and parsing
+6.6k ⭐  Comprehensive list of crawler frameworks
+https://github.com/BruceDone/awesome-crawler
+
+Crawler tool collection; search the page for the keyword "爬" (crawl)
+https://github.com/GitHubDaily/GitHubDaily/blob/cb618c17a72fc5a62248e5ac863d46fe0164487b/README.md?plain=1#L190
+
+
+330 ⭐  Awesome list covering web page parsers and data-extraction tools
+https://github.com/kimtth/awesome-azure-openai-llm/blob/9b16663bb4e38bc8760f3f274b92dfcca0ada34a/section/app.md
+Keyword search: https://github.com/search?q=Trafilatura+awesome++language%3AMarkdown&type=code&l=Markdown
+
+
+
+
+34.9k ⭐ markitdown
+https://github.com/microsoft/markitdown
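+
+A quick markitdown sketch (the input filename is a placeholder); it converts HTML/Office-style files into markdown text:
+```python
+from markitdown import MarkItDown
+
+md = MarkItDown()
+result = md.convert("downloaded_page.html")  # placeholder file
+print(result.text_content[:500])
+```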
+
+22k ⭐ firecrawl: AI crawling that returns clean, structured pages
+https://github.com/mendableai/firecrawl
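+
+A hedged firecrawl Python SDK sketch (needs an API key; the exact scrape options differ between SDK versions, so only the basic call is shown):
+```python
+from firecrawl import FirecrawlApp
+
+app = FirecrawlApp(api_key="fc-...")  # placeholder key
+result = app.scrape_url("https://example.com")  # returns scraped/structured page content
+print(result)
+```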
+
+17.3k ⭐  Python; uses AI to scrape web pages automatically, parse them into markdown, and extract custom-defined fields
+It can also generate scraper code and pin that code to run against a specific page
+https://github.com/ScrapeGraphAI/Scrapegraph-ai/blob/main/README.md
+
+7.5K ⭐  No-code: click elements with the mouse to extract all similar data into a table, JSON, or an API endpoint
+https://github.com/getmaxun/maxun
+
+241 ⭐  Describe what you want in natural language inside your code and the AI extracts the relevant data, converts it to markdown, and can also automate typing, pressing Enter, and submitting forms
+Requires an LLM API key
+https://github.com/dendrite-systems/dendrite-python-sdk
+
+readerLM-v2 (Jina AI's HTML-to-markdown model)
+
+3.1k ⭐ trafilatura: web page parser and text extractor
+https://github.com/adbar/trafilatura
+Evaluation results for a range of extraction tools; useful for seeing which ones perform best
+https://trafilatura.readthedocs.io/en/latest/evaluation.html#results-2022-05-18
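+
+A minimal trafilatura sketch (the URL is a placeholder): fetch_url downloads the page and extract returns the main text with boilerplate stripped.
+```python
+import trafilatura
+
+downloaded = trafilatura.fetch_url("https://example.com/article")
+text = trafilatura.extract(downloaded)  # None if extraction fails
+print(text)
+```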
+
+
+339 ⭐ Article extraction benchmark, with an accompanying paper and evaluation results
+https://github.com/scrapinghub/article-extraction-benchmark
+
+1.4k ⭐  Lightning-fast parser, around 240x faster than bs4
+Supports searching for similar elements to speed up lookups, and smart navigation for quickly jumping to parent, child, and sibling elements
+If an element's attributes change, it can still intelligently re-identify the changed element
+https://github.com/D4Vinci/Scrapling
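+
+A rough Scrapling sketch (assuming Adaptor accepts the raw HTML as its first argument, as in the project's README; the HTML and selector below are made up):
+```python
+from scrapling import Adaptor
+
+html = "<html><body><h2 class='title'>Hello</h2></body></html>"
+page = Adaptor(html, url="https://example.com/")
+titles = page.css("h2.title::text")  # CSS selection; ::text returns the text content
+print(titles)
+```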
+
+### Anti-bot-detection browsers
+365 ⭐  Patches stock Playwright so the automation is not detected
+https://github.com/rebrowser/rebrowser-playwright-python
+
+0.98k ⭐  Anti-bot-detection browser
+https://github.com/daijro/camoufox
+Browser startup info page: about:support
+
+Bot-detection test sites, for testing:
+https://www.browserscan.net/bot-detection
+"https://bot.sannysoft.com/"
+
+Check whether your proxy IP and browser fingerprint look genuine
+https://www.browserscan.net
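+
+A hedged sketch of the patched Playwright (assuming rebrowser-playwright mirrors the stock Playwright API under the rebrowser_playwright package, as its README suggests), pointed at one of the bot-detection test pages above:
+```python
+from rebrowser_playwright.sync_api import sync_playwright
+
+with sync_playwright() as p:
+    browser = p.chromium.launch(headless=False)
+    page = browser.new_page()
+    page.goto("https://bot.sannysoft.com/")
+    page.screenshot(path="bot_check.png")  # inspect which detection checks pass
+    browser.close()
+```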

+ 79 - 0
tests/test_converter_base.py

@@ -0,0 +1,79 @@
+import pytest
+from pathlib import Path
+from worker.html_convert.converter_base import ConverterBase
+
+class TestConverterBase:
+    """Test suite for ConverterBase class"""
+    
+    @pytest.fixture
+    def converter(self):
+        return ConverterBase()
+    
+    def test_extract_content_after_first_h1(self, converter):
+        """Test extracting content after first H1"""
+        sample_md = """
+Some header content to skip
+
+## PERMALINK
+Copy
+
+# Main Title Here
+
+Content starts here
+"""
+        expected = "# Main Title Here\n\nContent starts here"
+        result = converter.extract_content_after_first_h1(sample_md)
+        assert result.strip() == expected.strip()
+        
+    def test_fix_inline_links(self, converter):
+        """Test fixing inline links"""
+        # Test case 1: Relative URL with domain
+        sample_md_1 = "[Author Name](https://example.com/<https://actual.com/path>)"
+        expected_1 = "[Author Name](https://actual.com/path)"
+        assert converter.fix_inline_links(sample_md_1) == expected_1
+        
+        # Test case 2: Absolute URL
+        sample_md_2 = "[PMC Copyright](https://pmc.ncbi.nlm.nih.gov/articles/PMC9919988/</about/copyright/>)"
+        expected_2 = "[PMC Copyright](https://pmc.ncbi.nlm.nih.gov/articles/PMC9919988/about/copyright/)"
+        assert converter.fix_inline_links(sample_md_2) == expected_2
+        
+        # Test case 3: Already correct link
+        sample_md_3 = "[Normal Link](https://correct.com/path)"
+        expected_3 = "[Normal Link](https://correct.com/path)"
+        assert converter.fix_inline_links(sample_md_3) == expected_3
+        
+        # Test case 4: Image link with empty alt text
+        sample_md_4 = "![](https://pub.mdpi-res.com/img/table.png) [](https://www.mdpi.com/1420-3049/29/22/<#table_body_display_molecules-29-05310-t003>)"
+        expected_4 = "![](https://pub.mdpi-res.com/img/table.png) [](https://www.mdpi.com/1420-3049/29/22/#table_body_display_molecules-29-05310-t003)"
+        assert converter.fix_inline_links(sample_md_4) == expected_4
+    
+    def test_add_url_header(self, converter):
+        """Test adding URL header"""
+        content = "Some markdown content"
+        url = "https://example.com"
+        expected = "[https://example.com](https://example.com)\n\nSome markdown content"
+        assert converter.add_url_header(content, url) == expected
+    
+    def test_filter_markdown_integration(self, converter):
+        """Integration test for filter_markdown"""
+        sample_md = """
+[ Skip to main content ](https://pmc.ncbi.nlm.nih.gov/articles/PMC9919988/<#main-content>)
+![](https://pmc.ncbi.nlm.nih.gov/static/img/us_flag.svg)
+
+## PERMALINK
+Copy
+
+# Main Title Here
+
+### Author Name
+[Author](https://example.com/<https://actual.com/path>)
+"""
+        url = "https://example.com"
+        expected = ("[https://example.com](https://example.com)\n\n"
+                    "# Main Title Here\n\n"
+                    "### Author Name\n"
+                    "[Author](https://actual.com/path)")
+        result = converter.filter_markdown(sample_md)
+        result = converter.add_url_header(result, url)
+        assert result.strip() == expected.strip()
+

+ 16 - 15
worker/celery/html_convert_tasks.py

@@ -3,6 +3,7 @@ from worker.html_convert.pandoc import process_single_example, process_all_resul
 from mylib.logu import get_logger
 from worker.search_engine.search_result_db import SearchResultItem, SearchResultManager
 from sqlmodel import Session, select
+from worker.search_engine.valid_google_search import ValidSearchResult
 
 logger = get_logger('pandoc_tasks')
 
@@ -44,21 +45,20 @@ def convert_all_results_task():
         return {"status": "failed", "error": str(e)}
 
 def test_task_process_all_results():
-    # Process all results in the database
-    db_manager = SearchResultManager()
-    with Session(db_manager.engine) as session:
-        # Fetch all IDs with explicit ordering
-        result_ids = session.exec(select(SearchResultItem.id, SearchResultItem.html_path).order_by(SearchResultItem.id)).all()
-        logger.info(f"Total results: {len(result_ids)}")
-        logger.info(f"First 5 result IDs: {result_ids[:5]}")
-        
-        for result_id, html_path in result_ids:
-            try:
-                if html_path.endswith('.html'):
-                    logger.info(f"Submitting task for SearchResultItem ID: {result_id}")
-                    convert_single_result_task.delay(result_id)
-            except Exception as e:
-                logger.error(f"Error processing result {result_id}: {e}")
+    # Process all valid results using ValidSearchResult
+    valid_search = ValidSearchResult()
+    valid_items = valid_search.get_valid_search_result_items()
+    
+    logger.info(f"Total valid results: {len(valid_items)}")
+    logger.info(f"First 5 valid result IDs: {[item.id for item in valid_items[:5]]}")
+    
+    for item in valid_items:
+        try:
+            if item.html_path and item.html_path.endswith('.html'):
+                logger.info(f"Submitting task for valid SearchResultItem ID: {item.id}")
+                convert_single_result_task.delay(item.id)
+        except Exception as e:
+            logger.error(f"Error processing valid result {item.id}: {e}")
 
 def clear_existing_tasks():
     """清除所有待处理的任务"""
@@ -71,6 +71,7 @@ def clear_existing_tasks():
 def main():
     test_task_process_all_results()
     # clear_existing_tasks()
+    pass
 
 if __name__ == "__main__":
     main()

+ 102 - 5
worker/html_convert/converter_base.py

@@ -36,7 +36,30 @@ class ConverterBase:
         return convert_dir
     
     def extract_content_after_first_h1(self, content: str) -> str:
-        """Extract content starting from the first H1 heading"""
+        """
+        Extract content starting from the first H1 heading.
+        This removes any content before the first H1 tag.
+        
+        Example:
+        Input:
+        ```
+        Some header content
+        
+        ## Subtitle
+        More content
+        
+        # First Main Title
+        
+        Actual content starts here
+        ```
+        
+        Output:
+        ```
+        # First Main Title
+        
+        Actual content starts here
+        ```
+        """
         h1_pattern = r'^# .+$'
         match = re.search(h1_pattern, content, re.MULTILINE)
         if match:
@@ -44,14 +67,41 @@ class ConverterBase:
         return content
     
     def fix_inline_links(self, content: str) -> str:
-        """Fix inline links by handling the special URL patterns"""
-        link_pattern = r'\[([^\]]+)\]\(([^<]*)<([^>]*)>\)'
+        """
+        Fix inline links by handling special URL patterns.
+        This method processes markdown links in the format:
+        [text](domain<url>) and converts them to [text](url).
+        
+        Handles three cases:
+        1. If URL is relative, it combines with domain
+        2. If URL is absolute, it uses the URL directly
+        3. If link has empty text but contains <> pattern
+        
+        Examples:
+        1. [Author](https://example.com/<https://actual.com/path>)
+           => [Author](https://actual.com/path)
+        
+        2. [Link](https://domain.com/<relative/path>)
+           => [Link](https://domain.com/relative/path)
+           
+        3. ![](image.png) [](https://domain.com/<#anchor>)
+           => ![](image.png) [](https://domain.com/#anchor)
+        """
+        link_pattern = r'\[([^\]]*)\]\(([^<]*)<([^>]*)>\)'
         
         def replace_link(match):
             text = match.group(1)
             domain = match.group(2)
             url = match.group(3)
             
+            if not text and url.startswith('#'):
+                # Handle empty text with anchor links
+                if domain:
+                    parsed_domain = urlparse(domain)
+                    base_url = f"{parsed_domain.scheme}://{parsed_domain.netloc}{parsed_domain.path}"
+                    return f'[]({base_url}{url})'
+                return f'[]({url})'
+            
             if url.startswith('/'):
                 if domain:
                     parsed_domain = urlparse(domain)
@@ -64,11 +114,58 @@ class ConverterBase:
         return re.sub(link_pattern, replace_link, content)
     
     def add_url_header(self, content: str, url: str) -> str:
-        """Add URL as a header at the top of the content"""
+        """
+        Add URL as a header at the top of the content.
+        The URL is added in markdown link format:
+        [URL](URL)
+        
+        Example:
+        Input:
+        ```
+        Some content
+        ```
+        
+        With URL: https://example.com
+        
+        Output:
+        ```
+        [https://example.com](https://example.com)
+        
+        Some content
+        ```
+        """
         return f"[{url}]({url})\n\n{content}"
     
     def filter_markdown(self, content: str) -> str:
-        """Filter markdown content according to specified rules"""
+        """
+        Filter markdown content according to specified rules:
+        1. Remove content before first H1
+        2. Fix inline links
+        3. (URL header is added separately)
+        
+        Example:
+        Input:
+        ```
+        [ Skip to main content ](https://pmc.ncbi.nlm.nih.gov/articles/PMC9919988/<#main-content>)
+        ![](https://pmc.ncbi.nlm.nih.gov/static/img/us_flag.svg)
+        
+        ## PERMALINK
+        Copy
+        
+        # Main Title Here
+        
+        ### Author Name
+        [Author](https://example.com/<https://actual.com/path>)
+        ```
+        
+        Output:
+        ```
+        # Main Title Here
+        
+        ### Author Name
+        [Author](https://actual.com/path)
+        ```
+        """
         content = self.extract_content_after_first_h1(content)
         logger.info(f"extract_content_after_first_h1: {content[:300]}")
         content = self.fix_inline_links(content)

+ 171 - 0
worker/html_convert/export_files/copy_files.py

@@ -0,0 +1,171 @@
+from pathlib import Path
+import shutil
+
+
+def list_files(directory) -> list[Path]:
+    """
+    Walk the given directory recursively and return a list of file paths.
+    :param directory: directory to walk
+    :return: list of file paths
+    """
+    directory = Path(directory).resolve()
+    file_paths = []
+    for file in directory.rglob("*"):
+        if file.is_file():
+            file_paths.append(file)
+    return file_paths
+
+
+def parse_file_path(file_path):
+    """
+    Parse a file path and extract the {keyword}, {id} and file type (docling or filtered).
+    :param file_path: file path to parse
+    :return: (keyword, id, file_type, extension) or None
+    """
+    # Path relative to the export root
+    file_path = Path(file_path).resolve()
+    rel_path = file_path.relative_to(Path("output/copy_exported_files").resolve())
+    parts = rel_path.parts
+    if len(parts) < 2:
+        raise ValueError(f"Invalid file path: {file_path}")
+    
+    keyword = parts[0]
+    filename = parts[-1]
+    name = Path(filename).stem
+    ext = Path(filename).suffix
+    
+    # Check whether this is a PDF file
+    if ext.lower() == ".pdf":
+        return keyword, name, "pdf", ext
+    
+    try:
+        id_part, file_type = name.rsplit("_", 1)
+    except ValueError:
+        print(f"Skipping invalid filename format: {filename}")
+        return None
+    
+    return keyword, id_part, file_type, ext
+
+
+def restore_files(source_dir, target_dir, dry_run=False):
+    """
+    Restore files from source_dir into target_dir, organized by the target directory layout.
+    :param source_dir: source directory path
+    :param target_dir: target directory path
+    :param dry_run: if True, only print the planned operations without copying
+    """
+    # Collect all files
+    file_paths = list_files(source_dir)
+    
+    for file_path in file_paths:
+        try:
+            # Only process target file types
+            if file_path.suffix.lower() not in [".md", ".docx", ".pdf"]:
+                print(f"Skipping non-target file type: {file_path}")
+                continue
+            
+            # Parse the file path
+            parsed = parse_file_path(file_path)
+            if not parsed:
+                continue
+            
+            keyword, id_part, file_type, ext = parsed
+            
+            # Build the target path
+            if file_type == "pdf":
+                target_subdir = Path(target_dir) / keyword / "crawled_urls"
+                target_file = target_subdir / f"{id_part}{ext}"
+            else:
+                target_subdir = Path(target_dir) / keyword / "html_convert"
+                target_file = target_subdir / f"{id_part}_{file_type}{ext}"
+            
+            target_subdir.mkdir(parents=True, exist_ok=True)
+            
+            # Skip if the target file already exists
+            target_file_path = Path(target_file)
+            if target_file_path.exists():
+                print(f"File already exists, skipping: {target_file}")
+                continue
+            
+            # Dry-run mode: only report
+            if dry_run:
+                print(f"[Dry Run] Would copy: {file_path} -> {target_file.absolute()}")
+                continue
+            
+            # Copy the file
+            shutil.copy(file_path, target_file)
+            print(f"Copied: {file_path} -> {target_file}")
+            
+        except Exception as e:
+            print(f"Error processing {file_path}: {e}")
+
+
+def reverse_restore_files(source_dir, target_dir, dry_run=False):
+    """
+    Copy files from source_dir (results) back into target_dir (copy_exported_files).
+    :param source_dir: source directory path (results)
+    :param target_dir: target directory path (copy_exported_files)
+    :param dry_run: if True, only print the planned operations without copying
+    """
+    # Collect all files
+    file_paths = list_files(source_dir)
+    
+    for file_path in file_paths[20000:]:  # note: only processes files from index 20000 onward
+        try:
+            # Only process target file types
+            if file_path.suffix.lower() not in [".md", ".docx", ".pdf"]:
+                # print(f"Skipping non-target file type: {file_path}")
+                continue
+            
+            # Path relative to the source root
+            rel_path = file_path.relative_to(Path(source_dir).resolve())
+            parts = rel_path.parts
+            
+            if len(parts) < 3:
+                print(f"Skipping invalid path: {file_path}")
+                continue
+            
+            keyword = parts[0]
+            folder_type = parts[1]
+            filename = parts[-1]
+            
+            if folder_type == "html_convert":
+                # html_convert output files
+                target_file = Path(target_dir) / keyword / filename
+            elif folder_type == "crawled_urls":
+                # crawled PDF files
+                target_file = Path(target_dir) / keyword / filename
+            else:
+                print(f"Unknown folder type: {folder_type}")
+                continue
+            
+            # Skip if the target file already exists
+            if target_file.exists():
+                print(f"File already exists, skipping: {target_file}")
+                continue
+            
+            # Dry-run mode: only report
+            if dry_run:
+                print(f"[Dry Run] Would copy: {file_path} -> {target_file}")
+                continue
+            
+            # Create the target directory
+            target_file.parent.mkdir(parents=True, exist_ok=True)
+            
+            # Copy the file
+            shutil.copy(file_path, target_file)
+            print(f"Copied: {file_path} -> {target_file.absolute()}")
+            
+        except Exception as e:
+            print(f"Error processing {file_path}: {e}")
+
+
+if __name__ == "__main__":
+    # source_directory = "output/copy_exported_files"
+    # target_directory = "output/results"
+    # restore_files(source_directory, target_directory)
+    
+    # Reverse copy
+    reverse_source_directory = "output/results"
+    reverse_target_directory = "output/copy_exported_files"
+    reverse_restore_files(reverse_source_directory, reverse_target_directory)

+ 0 - 0
worker/html_convert/export_files/export_results.py


+ 16 - 12
worker/html_convert/pandoc.py

@@ -51,7 +51,7 @@ class PandocConverter:
             
             cmd = [
                 PANDOC_EXE,
-                '-f', 'markdown+yaml_metadata_block',
+                '-f', 'markdown+pipe_tables+simple_tables+multiline_tables',
                 '-t', 'docx',
                 '--reference-doc', self._get_reference_doc(),
                 '-o', str(output_path),
@@ -61,7 +61,7 @@ class PandocConverter:
             if self.include_toc:
                 # Specify heading levels for TOC
                 cmd.insert(-1, '--toc')
-                cmd.insert(-1, '--toc-depth=3')  # Include up to level 3 headings
+                cmd.insert(-1, '--toc-depth=2')  # Include up to level 2 headings
             
             # Add verbose flag to capture more information about resource fetching
             cmd.append('--verbose')
@@ -99,7 +99,7 @@ class PandocConverter:
                 return False
             
             # Get the HTML convert result
-            html_convert = session.exec(
+            html_convert: HtmlConvertResult = session.exec(
                 select(HtmlConvertResult)
                 .where(HtmlConvertResult.search_result_item_id == result_id)
             ).first()
@@ -138,7 +138,7 @@ class PandocConverter:
                     logger.info(f"Skipping already converted filtered markdown: {filtered_docx_path}")
                     filtered_success = True
                 else:
-                    filtered_success = self.convert_md_to_docx(filtered_md_path, filtered_docx_path)
+                    # filtered_success = self.convert_md_to_docx(filtered_md_path, filtered_docx_path)
+                    filtered_success = False  # conversion disabled for now; keep the flag defined for the check below
                     if filtered_success:
                         html_convert.pandoc_docx_path = str(filtered_docx_path)
                         html_convert.is_pandoc_converted = True
@@ -174,21 +174,21 @@ class PandocConverter:
         normal_style.font.size = Pt(12)
         doc.save(str(reference_doc))
 
-def process_single_example(result_id: int):
+def process_single_example(result_id: int, skip_existing=True):
     # Process a single result example
     docling_converter = DoclingConverter()
     search_result_item = docling_converter.get_search_result_item(result_id)
     if search_result_item.html_path.endswith('.html'):
-        docling_converter.process_conversion_by_id(result_id)
+        docling_converter.process_conversion_by_id(result_id, skip_existing=skip_existing)
     
     crawl_filter = CrawlFilter()
-    crawl_filter.process_filter_by_id(result_id)
+    crawl_filter.process_filter_by_id(result_id, skip_existing=skip_existing)
     
     pandoc_converter = PandocConverter(font_name="宋体", include_toc=True)
-    success = pandoc_converter.process_single_result(result_id, skip_existing=True)
+    logger.info(f"skip_existing {skip_existing}")
+    success = pandoc_converter.process_single_result(result_id, skip_existing=skip_existing)
     if success:
         logger.info(f"Successfully processed result {result_id}")
-        logger.info("Note: You may need to manually update the Table of Contents in Word after opening the document.")
     else:
         logger.error(f"Failed to process result {result_id}")
 
@@ -209,6 +209,10 @@ def process_all_results():
                 logger.error(f"Error processing result {result_id}: {e}")
 
 if __name__ == "__main__":
-    # Example usage
-    # process_single_example(6)
-    process_all_results()
+    # Measure the run time
+    import time
+    start_time = time.time()
+    process_single_example(996, skip_existing=False)
+    end_time = time.time()
+    print(f"Total time: {end_time - start_time} seconds")
+    # process_all_results()

+ 23 - 1
worker/search_engine/valid_google_search.py

@@ -1,9 +1,11 @@
+import json
+import os
 import time
 import re
 import logging
 from pathlib import Path
 from typing import Dict, Optional, List
-from DrissionPage import ChromiumPage
+from DrissionPage import ChromiumOptions, ChromiumPage
 from pydantic import BaseModel
 from scrapling import Adaptor
 from sqlmodel import Session, select
@@ -122,6 +124,26 @@ class ValidSearchResult:
         # page.quit()
         return result_item
         
+    def load_dp_page(self, proxy=None, no_imgs=False):
+        chrome_options = ChromiumOptions()
+        if proxy:
+            chrome_options.set_proxy(proxy)
+        # Otherwise fall back to the HTTP_PROXY environment variable if present
+        elif 'HTTP_PROXY' in os.environ:
+            chrome_options.set_proxy(os.environ['HTTP_PROXY'])
+        chrome_options.auto_port(True)
+        chrome_options.no_imgs(no_imgs)
+        
+        logger.info(f"proxy {proxy}")
+        page = ChromiumPage(chrome_options)
+        tab = page.latest_tab
+        # TODO: apply saved cookies here, e.g. tab.set.cookies(self.load_cookies())
+        return page
+
+    def load_cookies(self):
+        path = Path(r'G:\code\upwork\zhang_crawl_bio\CF-Clearance-Scraper\cookies.json')
+        export_cookies = json.loads(path.read_text(encoding='utf-8'))
+        return export_cookies.get('cookies')
 
 def main():
     vsr = ValidSearchResult()