# converter_base.py
  1. from pathlib import Path
  2. from typing import Optional
  3. from sqlmodel import Session, select
  4. from worker.search_engine.search_result_db import SearchResultItem, SearchResultManager
  5. from worker.html_convert.models import HtmlConvertResult
  6. from mylib.logu import get_logger
  7. import re
  8. from urllib.parse import urlparse, urljoin
  9. from config.settings import GOOGLE_SEARCH_DIR
  10. logger = get_logger('html_convert')
  11. class ConverterBase:
  12. """Base class for all conversion operations"""
  13. def __init__(self):
  14. self.db_manager = SearchResultManager()
  15. def get_search_result_item(self, result_id: int) -> Optional[SearchResultItem]:
  16. """Get the search result item by ID"""
  17. with Session(self.db_manager.engine) as session:
  18. return session.get(SearchResultItem, result_id)
  19. def get_html_convert_result(self, result_id: int) -> Optional[HtmlConvertResult]:
  20. """Get HtmlConvertResult by SearchResultItem ID"""
  21. with Session(self.db_manager.engine) as session:
  22. return session.exec(
  23. select(HtmlConvertResult)
  24. .where(HtmlConvertResult.search_result_item_id == result_id)
  25. ).first()
  26. def ensure_convert_dir(self, html_path: Path) -> Path:
  27. """Ensure conversion directory exists"""
  28. convert_dir = html_path.parent.parent / "html_convert"
  29. convert_dir.mkdir(exist_ok=True)
  30. return convert_dir
  31. def extract_content_after_first_h1(self, content: str) -> str:
  32. """
  33. Extract content starting from the first H1 heading.
  34. This removes any content before the first H1 tag.
  35. Example:
  36. Input:
  37. ```
  38. Some header content
  39. ## Subtitle
  40. More content
  41. # First Main Title
  42. Actual content starts here
  43. ```
  44. Output:
  45. ```
  46. # First Main Title
  47. Actual content starts here
  48. ```
  49. """
  50. h1_pattern = r'^# .+$'
  51. match = re.search(h1_pattern, content, re.MULTILINE)
  52. if match:
  53. return content[match.start():]
  54. return content
  55. def fix_inline_links(self, content: str) -> str:
  56. """
  57. Fix inline links by handling special URL patterns.
  58. This method processes markdown links in the format:
  59. [text](domain<url>) and converts them to [text](url).
  60. Handles three cases:
  61. 1. If URL is relative, it combines with domain
  62. 2. If URL is absolute, it uses the URL directly
  63. 3. If link has empty text but contains <> pattern
  64. Examples:
  65. 1. [Author](https://example.com/<https://actual.com/path>)
  66. => [Author](https://actual.com/path)
  67. 2. [Link](https://domain.com/<relative/path>)
  68. => [Link](https://domain.com/relative/path)
  69. 3. ![](image.png) [](https://domain.com/<#anchor>)
  70. => ![](image.png) [](https://domain.com/#anchor)
  71. """
  72. link_pattern = r'\[([^\]]*)\]\(([^<]*)<([^>]*)>\)'
  73. def replace_link(match):
  74. text = match.group(1)
  75. domain = match.group(2)
  76. url = match.group(3)
  77. if not text and url.startswith('#'):
  78. # Handle empty text with anchor links
  79. if domain:
  80. parsed_domain = urlparse(domain)
  81. base_url = f"{parsed_domain.scheme}://{parsed_domain.netloc}{parsed_domain.path}"
  82. return f'[]({base_url}{url})'
  83. return f'[]({url})'
  84. if url.startswith('/'):
  85. if domain:
  86. parsed_domain = urlparse(domain)
  87. base_url = f"{parsed_domain.scheme}://{parsed_domain.netloc}"
  88. return f'[{text}]({urljoin(base_url, url)})'
  89. return f'[{text}]({url})'
  90. return f'[{text}]({url})'
  91. return re.sub(link_pattern, replace_link, content)
  92. def add_url_header(self, content: str, url: str) -> str:
  93. """
  94. Add URL as a header at the top of the content.
  95. The URL is added in markdown link format:
  96. [URL](URL)
  97. Example:
  98. Input:
  99. ```
  100. Some content
  101. ```
  102. With URL: https://example.com
  103. Output:
  104. ```
  105. [https://example.com](https://example.com)
  106. Some content
  107. ```
  108. """
  109. return f"[{url}]({url})\n\n{content}"
  110. def filter_markdown(self, content: str) -> str:
  111. """
  112. Filter markdown content according to specified rules:
  113. 1. Remove content before first H1
  114. 2. Fix inline links
  115. 3. (URL header is added separately)
  116. Example:
  117. Input:
  118. ```
  119. [ Skip to main content ](https://pmc.ncbi.nlm.nih.gov/articles/PMC9919988/<#main-content>)
  120. ![](https://pmc.ncbi.nlm.nih.gov/static/img/us_flag.svg)
  121. ## PERMALINK
  122. Copy
  123. # Main Title Here
  124. ### Author Name
  125. [Author](https://example.com/<https://actual.com/path>)
  126. ```
  127. Output:
  128. ```
  129. # Main Title Here
  130. ### Author Name
  131. [Author](https://actual.com/path)
  132. ```
  133. """
  134. content = self.extract_content_after_first_h1(content)
  135. logger.info(f"extract_content_after_first_h1: {content[:300]}")
  136. content = self.fix_inline_links(content)
  137. logger.info(f"fix_inline_links: {content[:300]}")
  138. return content
  139. def save_html_convert_result(self, html_convert: HtmlConvertResult):
  140. """Save HtmlConvertResult to database"""
  141. with Session(self.db_manager.engine) as session:
  142. session.add(html_convert)
  143. session.commit()
  144. session.refresh(html_convert)