| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179 |
- from pathlib import Path
- from typing import Optional
- from sqlmodel import Session, select
- from worker.search_engine.search_result_db import SearchResultItem, SearchResultManager
- from worker.html_convert.models import HtmlConvertResult
- from mylib.logu import get_logger
- import re
- from urllib.parse import urlparse, urljoin
- from config.settings import GOOGLE_SEARCH_DIR
# Module-level logger registered under the name 'html_convert';
# ConverterBase.filter_markdown writes its progress messages through it.
- logger = get_logger('html_convert')
class ConverterBase:
    """Base class for all conversion operations.

    Combines database access for search results and their
    ``HtmlConvertResult`` records with helpers that clean up converted
    markdown (trimming to the first H1, repairing ``[text](prefix<target>)``
    links, and prepending a source-URL header).
    """

    # A markdown H1 line: "# " at the start of a line followed by text.
    _H1_PATTERN = re.compile(r'^# .+$', re.MULTILINE)

    # A link of the form [text](prefix<target>).
    # Groups: 1 = link text, 2 = prefix before "<", 3 = target inside "<>".
    _WRAPPED_LINK_PATTERN = re.compile(r'\[([^\]]*)\]\(([^<]*)<([^>]*)>\)')

    def __init__(self):
        """Create the ``SearchResultManager`` used for all database sessions."""
        self.db_manager = SearchResultManager()

    def get_search_result_item(self, result_id: int) -> Optional[SearchResultItem]:
        """Return the ``SearchResultItem`` with primary key *result_id*.

        Returns ``None`` when no such row exists. The session used for the
        lookup is closed before the method returns.
        """
        with Session(self.db_manager.engine) as session:
            return session.get(SearchResultItem, result_id)

    def get_html_convert_result(self, result_id: int) -> Optional[HtmlConvertResult]:
        """Return the first ``HtmlConvertResult`` linked to a search result.

        Args:
            result_id: ID of the ``SearchResultItem`` the conversion belongs
                to (matched against ``search_result_item_id``).

        Returns:
            The first matching record, or ``None`` when there is none.
        """
        with Session(self.db_manager.engine) as session:
            return session.exec(
                select(HtmlConvertResult)
                .where(HtmlConvertResult.search_result_item_id == result_id)
            ).first()

    def ensure_convert_dir(self, html_path: Path) -> Path:
        """Ensure the conversion output directory exists and return it.

        The directory is ``html_convert``, a sibling of the directory that
        contains *html_path* (i.e. ``html_path.parent.parent / "html_convert"``).
        Missing parent directories are created as well, so the call does not
        fail when the surrounding tree has not been created yet.
        """
        convert_dir = html_path.parent.parent / "html_convert"
        convert_dir.mkdir(parents=True, exist_ok=True)
        return convert_dir

    def extract_content_after_first_h1(self, content: str) -> str:
        """Return *content* starting at its first H1 heading.

        Everything before the first line of the form ``# Title`` is dropped.
        When there is no such line the content is returned unchanged.

        Example:
            Input::

                Some header content

                ## Subtitle
                More content

                # First Main Title

                Actual content starts here

            Output::

                # First Main Title

                Actual content starts here
        """
        match = self._H1_PATTERN.search(content)
        if match is None:
            return content
        return content[match.start():]

    def fix_inline_links(self, content: str) -> str:
        """Rewrite ``[text](prefix<target>)`` links to ``[text](url)``.

        How ``url`` is derived from ``prefix`` and ``target``:

        1. Absolute target (has a URL scheme): the target is used directly.
        2. Target starting with ``/``: joined with the scheme and host of
           the prefix.
        3. Any other relative target (``relative/path``, ``?query``):
           resolved against the full prefix.
        4. ``#anchor`` target with empty link text: appended to the prefix's
           scheme, host and path. With non-empty link text the anchor is
           kept on its own (``[text](#anchor)``).

        When the prefix or the target is empty, or either cannot be parsed
        as a URL, the link is rewritten to ``[text](target)`` as-is. Links
        without the ``<...>`` wrapper are left untouched.

        Examples:
            ``[Author](https://example.com/<https://actual.com/path>)``
            becomes ``[Author](https://actual.com/path)``

            ``[Link](https://domain.com/<relative/path>)``
            becomes ``[Link](https://domain.com/relative/path)``

            ``[](https://domain.com/<#anchor>)``
            becomes ``[](https://domain.com/#anchor)``
        """

        def replace_link(match):
            text, prefix, target = match.groups()
            unresolved = f'[{text}]({target})'

            # Nothing to resolve against, or nothing to resolve.
            if not prefix or not target:
                return unresolved

            try:
                if target.startswith('#'):
                    if text:
                        # Labelled anchors stay page-local.
                        return unresolved
                    base = urlparse(prefix)
                    return f'[]({base.scheme}://{base.netloc}{base.path}{target})'

                if target.startswith('/'):
                    base = urlparse(prefix)
                    site_root = f"{base.scheme}://{base.netloc}"
                    return f'[{text}]({urljoin(site_root, target)})'

                if urlparse(target).scheme:
                    # Already absolute: the prefix is just noise.
                    return unresolved

                # Plain relative reference: resolve it against the prefix.
                return f'[{text}]({urljoin(prefix, target)})'
            except ValueError:
                # urlparse/urljoin reject some malformed URLs; one bad link
                # must not abort the rewrite of the whole document.
                return unresolved

        return self._WRAPPED_LINK_PATTERN.sub(replace_link, content)

    def add_url_header(self, content: str, url: str) -> str:
        """Prepend *url* to *content* as a markdown link header.

        The header has the form ``[url](url)`` and is separated from the
        content by one blank line.

        Example:
            ``add_url_header("Some content", "https://example.com")`` returns::

                [https://example.com](https://example.com)

                Some content
        """
        return f"[{url}]({url})\n\n{content}"

    def filter_markdown(self, content: str) -> str:
        """Clean up converted markdown.

        Steps, in order:

        1. Drop everything before the first H1 heading.
        2. Repair ``[text](prefix<target>)`` inline links.

        The URL header is not added here; see ``add_url_header``. The first
        300 characters of the intermediate result are logged after each step.

        Example:
            Input::

                [ Skip to main content ](https://pmc.ncbi.nlm.nih.gov/articles/PMC9919988/<#main-content>)

                ## PERMALINK
                Copy

                # Main Title Here

                ### Author Name
                [Author](https://example.com/<https://actual.com/path>)

            Output::

                # Main Title Here

                ### Author Name
                [Author](https://actual.com/path)
        """
        content = self.extract_content_after_first_h1(content)
        logger.info(f"extract_content_after_first_h1: {content[:300]}")
        content = self.fix_inline_links(content)
        logger.info(f"fix_inline_links: {content[:300]}")
        return content

    def save_html_convert_result(self, html_convert: HtmlConvertResult):
        """Persist *html_convert* to the database.

        The record is added and committed, then refreshed so that it carries
        the values stored by the database before the session is closed.
        """
        with Session(self.db_manager.engine) as session:
            session.add(html_convert)
            session.commit()
            session.refresh(html_convert)
|