Browse files

Export data; follow-up tests; temporary save

mrh 9 months ago
parent
revision
1835feda6a

+ 3 - 1
.gitignore

@@ -7,4 +7,6 @@ __pycache__
 .pytest_cache
 download
 local_proxy_pool/
-*.rdb
+*.rdb
+.env
+CF-Clearance-Scraper

+ 81 - 0
architecture.md

@@ -0,0 +1,81 @@
+
+
+# Testing
+```shell
+C:\Users\mg\.local\bin\aider.exe
+
+python -m pytest tests/test_google_search.py -v
+```
+
+# Tools
+
+## Architecture-related
+3.3k ⭐  smart_open: transparently opens files on s3, hdfs, sftp, ftp, and the local filesystem
+https://github.com/piskvorky/smart_open
+
+2.2k ⭐  PyFilesystem2 is an abstract filesystem interface
+https://github.com/PyFilesystem/pyfilesystem2
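+
+A minimal smart_open sketch (bucket and key are placeholders; credentials are assumed to come from the usual boto3/environment configuration). The same `open()` call covers s3/hdfs/sftp/ftp/local paths:
+```python
+from smart_open import open  # drop-in replacement for the builtin open()
+
+# Swap the URI scheme for hdfs://, sftp://, ftp:// or a plain local path.
+with open("s3://my-bucket/reports/example.txt", "r", encoding="utf-8") as f:
+    for line in f:
+        print(line.strip())
+```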
+
+
+## Crawling and parsing
+6.6k ⭐  Comprehensive list of crawler frameworks
+https://github.com/BruceDone/awesome-crawler
+
+Crawler tool collection; search the page for the keyword "爬" (crawl)
+https://github.com/GitHubDaily/GitHubDaily/blob/cb618c17a72fc5a62248e5ac863d46fe0164487b/README.md?plain=1#L190
+
+
+330 ⭐  Awesome list covering web page parsers and data-extraction tools
+https://github.com/kimtth/awesome-azure-openai-llm/blob/9b16663bb4e38bc8760f3f274b92dfcca0ada34a/section/app.md
+Keyword search: https://github.com/search?q=Trafilatura+awesome++language%3AMarkdown&type=code&l=Markdown
+
+
+
+
+34.9k ⭐ markitdown
+https://github.com/microsoft/markitdown
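+
+A quick markitdown sketch (the input filename is a placeholder); it converts HTML/Office-style files into markdown text:
+```python
+from markitdown import MarkItDown
+
+md = MarkItDown()
+result = md.convert("downloaded_page.html")  # placeholder file
+print(result.text_content[:500])
+```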
+
+22k ⭐ firecrawl: AI crawling that returns clean, structured pages
+https://github.com/mendableai/firecrawl
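+
+A hedged firecrawl Python SDK sketch (needs an API key; the exact scrape options differ between SDK versions, so only the basic call is shown):
+```python
+from firecrawl import FirecrawlApp
+
+app = FirecrawlApp(api_key="fc-...")  # placeholder key
+result = app.scrape_url("https://example.com")  # returns scraped/structured page content
+print(result)
+```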
+
+17.3k ⭐  Python; uses AI to scrape web pages automatically, parse them into markdown, and extract custom-defined fields
+It can also generate scraper code and pin that code to run against a specific page
+https://github.com/ScrapeGraphAI/Scrapegraph-ai/blob/main/README.md
+
+7.5K ⭐  No-code: click elements with the mouse to extract all similar data into a table, JSON, or an API endpoint
+https://github.com/getmaxun/maxun
+
+241 ⭐  Describe what you want in natural language inside your code and the AI extracts the relevant data, converts it to markdown, and can also automate typing, pressing Enter, and submitting forms
+Requires an LLM API key
+https://github.com/dendrite-systems/dendrite-python-sdk
+
+readerLM-v2 (Jina AI's HTML-to-markdown model)
+
+3.1k ⭐ trafilatura: web page parser and text extractor
+https://github.com/adbar/trafilatura
+Evaluation results for a range of extraction tools; useful for seeing which ones perform best
+https://trafilatura.readthedocs.io/en/latest/evaluation.html#results-2022-05-18
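+
+A minimal trafilatura sketch (the URL is a placeholder): fetch_url downloads the page and extract returns the main text with boilerplate stripped.
+```python
+import trafilatura
+
+downloaded = trafilatura.fetch_url("https://example.com/article")
+text = trafilatura.extract(downloaded)  # None if extraction fails
+print(text)
+```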
+
+
+339 ⭐ Article extraction benchmark, with an accompanying paper and evaluation results
+https://github.com/scrapinghub/article-extraction-benchmark
+
+1.4k ⭐  Lightning-fast parser, around 240x faster than bs4
+Supports searching for similar elements to speed up lookups, and smart navigation for quickly jumping to parent, child, and sibling elements
+If an element's attributes change, it can still intelligently re-identify the changed element
+https://github.com/D4Vinci/Scrapling
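+
+A rough Scrapling sketch (assuming Adaptor accepts the raw HTML as its first argument, as in the project's README; the HTML and selector below are made up):
+```python
+from scrapling import Adaptor
+
+html = "<html><body><h2 class='title'>Hello</h2></body></html>"
+page = Adaptor(html, url="https://example.com/")
+titles = page.css("h2.title::text")  # CSS selection; ::text returns the text content
+print(titles)
+```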
+
+### Anti-bot-detection browsers
+365 ⭐  Patches stock Playwright so the automation is not detected
+https://github.com/rebrowser/rebrowser-playwright-python
+
+0.98k ⭐  Anti-bot-detection browser
+https://github.com/daijro/camoufox
+Browser startup info page: about:support
+
+Bot-detection test sites, for testing:
+https://www.browserscan.net/bot-detection
+"https://bot.sannysoft.com/"
+
+Check whether your proxy IP and browser fingerprint look genuine
+https://www.browserscan.net
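+
+A hedged sketch of the patched Playwright (assuming rebrowser-playwright mirrors the stock Playwright API under the rebrowser_playwright package, as its README suggests), pointed at one of the bot-detection test pages above:
+```python
+from rebrowser_playwright.sync_api import sync_playwright
+
+with sync_playwright() as p:
+    browser = p.chromium.launch(headless=False)
+    page = browser.new_page()
+    page.goto("https://bot.sannysoft.com/")
+    page.screenshot(path="bot_check.png")  # inspect which detection checks pass
+    browser.close()
+```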

+ 79 - 0
tests/test_converter_base.py

@@ -0,0 +1,79 @@
+import pytest
+from pathlib import Path
+from worker.html_convert.converter_base import ConverterBase
+
+class TestConverterBase:
+    """Test suite for ConverterBase class"""
+    
+    @pytest.fixture
+    def converter(self):
+        return ConverterBase()
+    
+    def test_extract_content_after_first_h1(self, converter):
+        """Test extracting content after first H1"""
+        sample_md = """
+Some header content to skip
+
+## PERMALINK
+Copy
+
+# Main Title Here
+
+Content starts here
+"""
+        expected = "# Main Title Here\n\nContent starts here"
+        result = converter.extract_content_after_first_h1(sample_md)
+        assert result.strip() == expected.strip()
+        
+    def test_fix_inline_links(self, converter):
+        """Test fixing inline links"""
+        # Test case 1: Relative URL with domain
+        sample_md_1 = "[Author Name](https://example.com/<https://actual.com/path>)"
+        expected_1 = "[Author Name](https://actual.com/path)"
+        assert converter.fix_inline_links(sample_md_1) == expected_1
+        
+        # Test case 2: Absolute URL
+        sample_md_2 = "[PMC Copyright](https://pmc.ncbi.nlm.nih.gov/articles/PMC9919988/</about/copyright/>)"
+        expected_2 = "[PMC Copyright](https://pmc.ncbi.nlm.nih.gov/articles/PMC9919988/about/copyright/)"
+        assert converter.fix_inline_links(sample_md_2) == expected_2
+        
+        # Test case 3: Already correct link
+        sample_md_3 = "[Normal Link](https://correct.com/path)"
+        expected_3 = "[Normal Link](https://correct.com/path)"
+        assert converter.fix_inline_links(sample_md_3) == expected_3
+        
+        # Test case 4: Image link with empty alt text
+        sample_md_4 = "![](https://pub.mdpi-res.com/img/table.png) [](https://www.mdpi.com/1420-3049/29/22/<#table_body_display_molecules-29-05310-t003>)"
+        expected_4 = "![](https://pub.mdpi-res.com/img/table.png) [](https://www.mdpi.com/1420-3049/29/22/#table_body_display_molecules-29-05310-t003)"
+        assert converter.fix_inline_links(sample_md_4) == expected_4
+    
+    def test_add_url_header(self, converter):
+        """Test adding URL header"""
+        content = "Some markdown content"
+        url = "https://example.com"
+        expected = "[https://example.com](https://example.com)\n\nSome markdown content"
+        assert converter.add_url_header(content, url) == expected
+    
+    def test_filter_markdown_integration(self, converter):
+        """Integration test for filter_markdown"""
+        sample_md = """
+[ Skip to main content ](https://pmc.ncbi.nlm.nih.gov/articles/PMC9919988/<#main-content>)
+![](https://pmc.ncbi.nlm.nih.gov/static/img/us_flag.svg)
+
+## PERMALINK
+Copy
+
+# Main Title Here
+
+### Author Name
+[Author](https://example.com/<https://actual.com/path>)
+"""
+        url = "https://example.com"
+        expected = ("[https://example.com](https://example.com)\n\n"
+                    "# Main Title Here\n\n"
+                    "### Author Name\n"
+                    "[Author](https://actual.com/path)")
+        result = converter.filter_markdown(sample_md)
+        result = converter.add_url_header(result, url)
+        assert result.strip() == expected.strip()
+

+ 16 - 15
worker/celery/html_convert_tasks.py

@@ -3,6 +3,7 @@ from worker.html_convert.pandoc import process_single_example, process_all_resul
 from mylib.logu import get_logger
 from worker.search_engine.search_result_db import SearchResultItem, SearchResultManager
 from sqlmodel import Session, select
+from worker.search_engine.valid_google_search import ValidSearchResult
 
 logger = get_logger('pandoc_tasks')
 
@@ -44,21 +45,20 @@ def convert_all_results_task():
         return {"status": "failed", "error": str(e)}
 
 def test_task_process_all_results():
-    # Process all results in the database
-    db_manager = SearchResultManager()
-    with Session(db_manager.engine) as session:
-        # Fetch all IDs with explicit ordering
-        result_ids = session.exec(select(SearchResultItem.id, SearchResultItem.html_path).order_by(SearchResultItem.id)).all()
-        logger.info(f"Total results: {len(result_ids)}")
-        logger.info(f"First 5 result IDs: {result_ids[:5]}")
-        
-        for result_id, html_path in result_ids:
-            try:
-                if html_path.endswith('.html'):
-                    logger.info(f"Submitting task for SearchResultItem ID: {result_id}")
-                    convert_single_result_task.delay(result_id)
-            except Exception as e:
-                logger.error(f"Error processing result {result_id}: {e}")
+    # Process all valid results using ValidSearchResult
+    valid_search = ValidSearchResult()
+    valid_items = valid_search.get_valid_search_result_items()
+    
+    logger.info(f"Total valid results: {len(valid_items)}")
+    logger.info(f"First 5 valid result IDs: {[item.id for item in valid_items[:5]]}")
+    
+    for item in valid_items:
+        try:
+            if item.html_path and item.html_path.endswith('.html'):
+                logger.info(f"Submitting task for valid SearchResultItem ID: {item.id}")
+                convert_single_result_task.delay(item.id)
+        except Exception as e:
+            logger.error(f"Error processing valid result {item.id}: {e}")
 
 def clear_existing_tasks():
     """清除所有待处理的任务"""
@@ -71,6 +71,7 @@ def clear_existing_tasks():
 def main():
     test_task_process_all_results()
     # clear_existing_tasks()
+    pass
 
 if __name__ == "__main__":
     main()

+ 102 - 5
worker/html_convert/converter_base.py

@@ -36,7 +36,30 @@ class ConverterBase:
         return convert_dir
     
     def extract_content_after_first_h1(self, content: str) -> str:
-        """Extract content starting from the first H1 heading"""
+        """
+        Extract content starting from the first H1 heading.
+        This removes any content before the first H1 tag.
+        
+        Example:
+        Input:
+        ```
+        Some header content
+        
+        ## Subtitle
+        More content
+        
+        # First Main Title
+        
+        Actual content starts here
+        ```
+        
+        Output:
+        ```
+        # First Main Title
+        
+        Actual content starts here
+        ```
+        """
         h1_pattern = r'^# .+$'
         match = re.search(h1_pattern, content, re.MULTILINE)
         if match:
@@ -44,14 +67,41 @@ class ConverterBase:
         return content
     
     def fix_inline_links(self, content: str) -> str:
-        """Fix inline links by handling the special URL patterns"""
-        link_pattern = r'\[([^\]]+)\]\(([^<]*)<([^>]*)>\)'
+        """
+        Fix inline links by handling special URL patterns.
+        This method processes markdown links in the format:
+        [text](domain<url>) and converts them to [text](url).
+        
+        Handles three cases:
+        1. If URL is relative, it combines with domain
+        2. If URL is absolute, it uses the URL directly
+        3. If link has empty text but contains <> pattern
+        
+        Examples:
+        1. [Author](https://example.com/<https://actual.com/path>)
+           => [Author](https://actual.com/path)
+        
+        2. [Link](https://domain.com/<relative/path>)
+           => [Link](https://domain.com/relative/path)
+           
+        3. ![](image.png) [](https://domain.com/<#anchor>)
+           => ![](image.png) [](https://domain.com/#anchor)
+        """
+        link_pattern = r'\[([^\]]*)\]\(([^<]*)<([^>]*)>\)'
         
         def replace_link(match):
             text = match.group(1)
             domain = match.group(2)
             url = match.group(3)
             
+            if not text and url.startswith('#'):
+                # Handle empty text with anchor links
+                if domain:
+                    parsed_domain = urlparse(domain)
+                    base_url = f"{parsed_domain.scheme}://{parsed_domain.netloc}{parsed_domain.path}"
+                    return f'[]({base_url}{url})'
+                return f'[]({url})'
+            
             if url.startswith('/'):
                 if domain:
                     parsed_domain = urlparse(domain)
@@ -64,11 +114,58 @@ class ConverterBase:
         return re.sub(link_pattern, replace_link, content)
     
     def add_url_header(self, content: str, url: str) -> str:
-        """Add URL as a header at the top of the content"""
+        """
+        Add URL as a header at the top of the content.
+        The URL is added in markdown link format:
+        [URL](URL)
+        
+        Example:
+        Input:
+        ```
+        Some content
+        ```
+        
+        With URL: https://example.com
+        
+        Output:
+        ```
+        [https://example.com](https://example.com)
+        
+        Some content
+        ```
+        """
         return f"[{url}]({url})\n\n{content}"
     
     def filter_markdown(self, content: str) -> str:
-        """Filter markdown content according to specified rules"""
+        """
+        Filter markdown content according to specified rules:
+        1. Remove content before first H1
+        2. Fix inline links
+        3. (URL header is added separately)
+        
+        Example:
+        Input:
+        ```
+        [ Skip to main content ](https://pmc.ncbi.nlm.nih.gov/articles/PMC9919988/<#main-content>)
+        ![](https://pmc.ncbi.nlm.nih.gov/static/img/us_flag.svg)
+        
+        ## PERMALINK
+        Copy
+        
+        # Main Title Here
+        
+        ### Author Name
+        [Author](https://example.com/<https://actual.com/path>)
+        ```
+        
+        Output:
+        ```
+        # Main Title Here
+        
+        ### Author Name
+        [Author](https://actual.com/path)
+        ```
+        """
         content = self.extract_content_after_first_h1(content)
         logger.info(f"extract_content_after_first_h1: {content[:300]}")
         content = self.fix_inline_links(content)

+ 171 - 0
worker/html_convert/export_files/copy_files.py

@@ -0,0 +1,171 @@
+from pathlib import Path
+import shutil
+
+
+def list_files(directory) -> list[Path]:
+    """
+    Walk the given directory recursively and return a list of file paths.
+    :param directory: directory to walk
+    :return: list of file paths
+    """
+    directory = Path(directory).resolve()
+    file_paths = []
+    for file in directory.rglob("*"):
+        if file.is_file():
+            file_paths.append(file)
+    return file_paths
+
+
+def parse_file_path(file_path):
+    """
+    Parse a file path and extract the {keyword}, {id} and file type (docling or filtered).
+    :param file_path: file path to parse
+    :return: (keyword, id, file_type, extension) or None
+    """
+    # Path relative to the export root
+    file_path = Path(file_path).resolve()
+    rel_path = file_path.relative_to(Path("output/copy_exported_files").resolve())
+    parts = rel_path.parts
+    if len(parts) < 2:
+        raise ValueError(f"Invalid file path: {file_path}")
+    
+    keyword = parts[0]
+    filename = parts[-1]
+    name = Path(filename).stem
+    ext = Path(filename).suffix
+    
+    # Check whether this is a PDF file
+    if ext.lower() == ".pdf":
+        return keyword, name, "pdf", ext
+    
+    try:
+        id_part, file_type = name.rsplit("_", 1)
+    except ValueError:
+        print(f"Skipping invalid filename format: {filename}")
+        return None
+    
+    return keyword, id_part, file_type, ext
+
+
+def restore_files(source_dir, target_dir, dry_run=False):
+    """
+    Restore files from source_dir into target_dir, organized by the target directory layout.
+    :param source_dir: source directory path
+    :param target_dir: target directory path
+    :param dry_run: if True, only print the planned operations without copying
+    """
+    # Collect all files
+    file_paths = list_files(source_dir)
+    
+    for file_path in file_paths:
+        try:
+            # Only process target file types
+            if file_path.suffix.lower() not in [".md", ".docx", ".pdf"]:
+                print(f"Skipping non-target file type: {file_path}")
+                continue
+            
+            # Parse the file path
+            parsed = parse_file_path(file_path)
+            if not parsed:
+                continue
+            
+            keyword, id_part, file_type, ext = parsed
+            
+            # Build the target path
+            if file_type == "pdf":
+                target_subdir = Path(target_dir) / keyword / "crawled_urls"
+                target_file = target_subdir / f"{id_part}{ext}"
+            else:
+                target_subdir = Path(target_dir) / keyword / "html_convert"
+                target_file = target_subdir / f"{id_part}_{file_type}{ext}"
+            
+            target_subdir.mkdir(parents=True, exist_ok=True)
+            
+            # Skip if the target file already exists
+            target_file_path = Path(target_file)
+            if target_file_path.exists():
+                print(f"File already exists, skipping: {target_file}")
+                continue
+            
+            # Dry-run mode: only report
+            if dry_run:
+                print(f"[Dry Run] Would copy: {file_path} -> {target_file.absolute()}")
+                continue
+            
+            # Copy the file
+            shutil.copy(file_path, target_file)
+            print(f"Copied: {file_path} -> {target_file}")
+            
+        except Exception as e:
+            print(f"Error processing {file_path}: {e}")
+
+
+def reverse_restore_files(source_dir, target_dir, dry_run=False):
+    """
+    Copy files from source_dir (results) back into target_dir (copy_exported_files).
+    :param source_dir: source directory path (results)
+    :param target_dir: target directory path (copy_exported_files)
+    :param dry_run: if True, only print the planned operations without copying
+    """
+    # Collect all files
+    file_paths = list_files(source_dir)
+    
+    for file_path in file_paths[20000:]:  # note: only processes files from index 20000 onward
+        try:
+            # Only process target file types
+            if file_path.suffix.lower() not in [".md", ".docx", ".pdf"]:
+                # print(f"Skipping non-target file type: {file_path}")
+                continue
+            
+            # Path relative to the source root
+            rel_path = file_path.relative_to(Path(source_dir).resolve())
+            parts = rel_path.parts
+            
+            if len(parts) < 3:
+                print(f"Skipping invalid path: {file_path}")
+                continue
+            
+            keyword = parts[0]
+            folder_type = parts[1]
+            filename = parts[-1]
+            
+            if folder_type == "html_convert":
+                # html_convert output files
+                target_file = Path(target_dir) / keyword / filename
+            elif folder_type == "crawled_urls":
+                # crawled PDF files
+                target_file = Path(target_dir) / keyword / filename
+            else:
+                print(f"Unknown folder type: {folder_type}")
+                continue
+            
+            # Skip if the target file already exists
+            if target_file.exists():
+                print(f"File already exists, skipping: {target_file}")
+                continue
+            
+            # Dry-run mode: only report
+            if dry_run:
+                print(f"[Dry Run] Would copy: {file_path} -> {target_file}")
+                continue
+            
+            # Create the target directory
+            target_file.parent.mkdir(parents=True, exist_ok=True)
+            
+            # Copy the file
+            shutil.copy(file_path, target_file)
+            print(f"Copied: {file_path} -> {target_file.absolute()}")
+            
+        except Exception as e:
+            print(f"Error processing {file_path}: {e}")
+
+
+if __name__ == "__main__":
+    # source_directory = "output/copy_exported_files"
+    # target_directory = "output/results"
+    # restore_files(source_directory, target_directory)
+    
+    # Reverse copy
+    reverse_source_directory = "output/results"
+    reverse_target_directory = "output/copy_exported_files"
+    reverse_restore_files(reverse_source_directory, reverse_target_directory)

+ 0 - 0
worker/html_convert/export_files/export_results.py


+ 16 - 12
worker/html_convert/pandoc.py

@@ -51,7 +51,7 @@ class PandocConverter:
             
             cmd = [
                 PANDOC_EXE,
-                '-f', 'markdown+yaml_metadata_block',
+                '-f', 'markdown+pipe_tables+simple_tables+multiline_tables',
                 '-t', 'docx',
                 '--reference-doc', self._get_reference_doc(),
                 '-o', str(output_path),
@@ -61,7 +61,7 @@ class PandocConverter:
             if self.include_toc:
                 # Specify heading levels for TOC
                 cmd.insert(-1, '--toc')
-                cmd.insert(-1, '--toc-depth=3')  # Include up to level 3 headings
+                cmd.insert(-1, '--toc-depth=2')  # Include up to level 2 headings
             
             # Add verbose flag to capture more information about resource fetching
             cmd.append('--verbose')
@@ -99,7 +99,7 @@ class PandocConverter:
                 return False
             
             # Get the HTML convert result
-            html_convert = session.exec(
+            html_convert: HtmlConvertResult = session.exec(
                 select(HtmlConvertResult)
                 .where(HtmlConvertResult.search_result_item_id == result_id)
             ).first()
@@ -138,7 +138,7 @@ class PandocConverter:
                     logger.info(f"Skipping already converted filtered markdown: {filtered_docx_path}")
                     filtered_success = True
                 else:
-                    filtered_success = self.convert_md_to_docx(filtered_md_path, filtered_docx_path)
+                    # filtered_success = self.convert_md_to_docx(filtered_md_path, filtered_docx_path)
+                    filtered_success = False  # conversion disabled for now; keep the flag defined for the check below
                     if filtered_success:
                         html_convert.pandoc_docx_path = str(filtered_docx_path)
                         html_convert.is_pandoc_converted = True
@@ -174,21 +174,21 @@ class PandocConverter:
         normal_style.font.size = Pt(12)
         doc.save(str(reference_doc))
 
-def process_single_example(result_id: int):
+def process_single_example(result_id: int, skip_existing=True):
     # Process a single result example
     docling_converter = DoclingConverter()
     search_result_item = docling_converter.get_search_result_item(result_id)
     if search_result_item.html_path.endswith('.html'):
-        docling_converter.process_conversion_by_id(result_id)
+        docling_converter.process_conversion_by_id(result_id, skip_existing=skip_existing)
     
     crawl_filter = CrawlFilter()
-    crawl_filter.process_filter_by_id(result_id)
+    crawl_filter.process_filter_by_id(result_id, skip_existing=skip_existing)
     
     pandoc_converter = PandocConverter(font_name="宋体", include_toc=True)
-    success = pandoc_converter.process_single_result(result_id, skip_existing=True)
+    logger.info(f"skip_existing {skip_existing}")
+    success = pandoc_converter.process_single_result(result_id, skip_existing=skip_existing)
     if success:
         logger.info(f"Successfully processed result {result_id}")
-        logger.info("Note: You may need to manually update the Table of Contents in Word after opening the document.")
     else:
         logger.error(f"Failed to process result {result_id}")
 
@@ -209,6 +209,10 @@ def process_all_results():
                 logger.error(f"Error processing result {result_id}: {e}")
 
 if __name__ == "__main__":
-    # Example usage
-    # process_single_example(6)
-    process_all_results()
+    # Measure the run time
+    import time
+    start_time = time.time()
+    process_single_example(996, skip_existing=False)
+    end_time = time.time()
+    print(f"Total time: {end_time - start_time} seconds")
+    # process_all_results()

+ 23 - 1
worker/search_engine/valid_google_search.py

@@ -1,9 +1,11 @@
+import json
+import os
 import time
 import re
 import logging
 from pathlib import Path
 from typing import Dict, Optional, List
-from DrissionPage import ChromiumPage
+from DrissionPage import ChromiumOptions, ChromiumPage
 from pydantic import BaseModel
 from scrapling import Adaptor
 from sqlmodel import Session, select
@@ -122,6 +124,26 @@ class ValidSearchResult:
         # page.quit()
         return result_item
         
+    def load_dp_page(self, proxy=None, no_imgs=False):
+        chrome_options = ChromiumOptions()
+        if proxy:
+            chrome_options.set_proxy(proxy)
+        # Otherwise fall back to the HTTP_PROXY environment variable if present
+        elif 'HTTP_PROXY' in os.environ:
+            chrome_options.set_proxy(os.environ['HTTP_PROXY'])
+        chrome_options.auto_port(True)
+        chrome_options.no_imgs(no_imgs)
+        
+        logger.info(f"proxy {proxy}")
+        page = ChromiumPage(chrome_options)
+        tab = page.latest_tab
+        # TODO: apply saved cookies here, e.g. tab.set.cookies(self.load_cookies())
+        return page
+
+    def load_cookies(self):
+        path = Path(r'G:\code\upwork\zhang_crawl_bio\CF-Clearance-Scraper\cookies.json')
+        export_cookies = json.loads(path.read_text(encoding='utf-8'))
+        return export_cookies.get('cookies')
 
 def main():
     vsr = ValidSearchResult()