part-time-job
/
zhang_crawl_bio


			
				
					
						
						
							12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879
							import pytest
from pathlib import Path
from worker.html_convert.converter_base import ConverterBase

class TestConverterBase:
    """Checks for the markdown clean-up helpers exposed by ConverterBase.

    Covers heading-based trimming, repair of links whose target contains an
    angle-bracketed segment, URL header insertion, and the combined
    filter_markdown pipeline.
    """

    @pytest.fixture
    def converter(self):
        """Build the ConverterBase instance handed to each test."""
        instance = ConverterBase()
        return instance

    def test_extract_content_after_first_h1(self, converter):
        """Content ahead of the first level-1 heading should be discarded."""
        # The "## PERMALINK" heading is level 2, so per the expectation below
        # the kept text starts at "# Main Title Here".
        raw_markdown = (
            "\n"
            "Some header content to skip\n"
            "\n"
            "## PERMALINK\n"
            "Copy\n"
            "\n"
            "# Main Title Here\n"
            "\n"
            "Content starts here\n"
        )
        wanted = "# Main Title Here\n\nContent starts here"

        trimmed = converter.extract_content_after_first_h1(raw_markdown)

        # Surrounding whitespace is not part of the contract being checked.
        assert trimmed.strip() == wanted.strip()

    def test_fix_inline_links(self, converter):
        """Links with a `<...>` segment in the target should be normalised."""
        # Each entry is (input markdown, expected output); order matters only
        # in that the first failing case stops the test.
        cases = [
            # Bracketed segment is itself an absolute URL: only it is kept.
            (
                "[Author Name](https://example.com/<https://actual.com/path>)",
                "[Author Name](https://actual.com/path)",
            ),
            # Bracketed segment is a path starting with "/": it is appended to
            # the outer URL without producing a doubled slash.
            (
                "[PMC Copyright](https://pmc.ncbi.nlm.nih.gov/articles/PMC9919988/</about/copyright/>)",
                "[PMC Copyright](https://pmc.ncbi.nlm.nih.gov/articles/PMC9919988/about/copyright/)",
            ),
            # No bracketed segment: the link must come back untouched.
            (
                "[Normal Link](https://correct.com/path)",
                "[Normal Link](https://correct.com/path)",
            ),
            # Image followed by an empty-text link whose bracketed segment is
            # a "#fragment"; the image itself must stay as it is.
            (
                "![](https://pub.mdpi-res.com/img/table.png) [](https://www.mdpi.com/1420-3049/29/22/<#table_body_display_molecules-29-05310-t003>)",
                "![](https://pub.mdpi-res.com/img/table.png) [](https://www.mdpi.com/1420-3049/29/22/#table_body_display_molecules-29-05310-t003)",
            ),
        ]

        for broken, repaired in cases:
            assert converter.fix_inline_links(broken) == repaired

    def test_add_url_header(self, converter):
        """The source URL should be prepended as a self-referencing link."""
        body = "Some markdown content"
        source_url = "https://example.com"
        wanted = "[https://example.com](https://example.com)\n\nSome markdown content"

        assert converter.add_url_header(body, source_url) == wanted

    def test_filter_markdown_integration(self, converter):
        """filter_markdown followed by add_url_header yields the clean page."""
        # Input mimics a fetched article: navigation link and image first,
        # a level-2 heading, then the real title and an author link whose
        # target contains a bracketed absolute URL.
        raw_markdown = (
            "\n"
            "[ Skip to main content ](https://pmc.ncbi.nlm.nih.gov/articles/PMC9919988/<#main-content>)\n"
            "![](https://pmc.ncbi.nlm.nih.gov/static/img/us_flag.svg)\n"
            "\n"
            "## PERMALINK\n"
            "Copy\n"
            "\n"
            "# Main Title Here\n"
            "\n"
            "### Author Name\n"
            "[Author](https://example.com/<https://actual.com/path>)\n"
        )
        source_url = "https://example.com"
        wanted = "\n".join(
            [
                "[https://example.com](https://example.com)",
                "",
                "# Main Title Here",
                "",
                "### Author Name",
                "[Author](https://actual.com/path)",
            ]
        )

        filtered = converter.filter_markdown(raw_markdown)
        with_header = converter.add_url_header(filtered, source_url)

        # Leading/trailing whitespace is ignored for the comparison.
        assert with_header.strip() == wanted.strip()