# test_converter_base.py
import textwrap
from pathlib import Path

import pytest

from worker.html_convert.converter_base import ConverterBase


  4. class TestConverterBase:
  5. """Test suite for ConverterBase class"""
  6. @pytest.fixture
  7. def converter(self):
  8. return ConverterBase()
  9. def test_extract_content_after_first_h1(self, converter):
  10. """Test extracting content after first H1"""
  11. sample_md = """
  12. Some header content to skip
  13. ## PERMALINK
  14. Copy
  15. # Main Title Here
  16. Content starts here
  17. """
  18. expected = "# Main Title Here\n\nContent starts here"
  19. result = converter.extract_content_after_first_h1(sample_md)
  20. assert result.strip() == expected.strip()
  21. def test_fix_inline_links(self, converter):
  22. """Test fixing inline links"""
  23. # Test case 1: Relative URL with domain
  24. sample_md_1 = "[Author Name](https://example.com/<https://actual.com/path>)"
  25. expected_1 = "[Author Name](https://actual.com/path)"
  26. assert converter.fix_inline_links(sample_md_1) == expected_1
  27. # Test case 2: Absolute URL
  28. sample_md_2 = "[PMC Copyright](https://pmc.ncbi.nlm.nih.gov/articles/PMC9919988/</about/copyright/>)"
  29. expected_2 = "[PMC Copyright](https://pmc.ncbi.nlm.nih.gov/articles/PMC9919988/about/copyright/)"
  30. assert converter.fix_inline_links(sample_md_2) == expected_2
  31. # Test case 3: Already correct link
  32. sample_md_3 = "[Normal Link](https://correct.com/path)"
  33. expected_3 = "[Normal Link](https://correct.com/path)"
  34. assert converter.fix_inline_links(sample_md_3) == expected_3
  35. # Test case 4: Image link with empty alt text
  36. sample_md_4 = "![](https://pub.mdpi-res.com/img/table.png) [](https://www.mdpi.com/1420-3049/29/22/<#table_body_display_molecules-29-05310-t003>)"
  37. expected_4 = "![](https://pub.mdpi-res.com/img/table.png) [](https://www.mdpi.com/1420-3049/29/22/#table_body_display_molecules-29-05310-t003)"
  38. assert converter.fix_inline_links(sample_md_4) == expected_4
  39. def test_add_url_header(self, converter):
  40. """Test adding URL header"""
  41. content = "Some markdown content"
  42. url = "https://example.com"
  43. expected = "[https://example.com](https://example.com)\n\nSome markdown content"
  44. assert converter.add_url_header(content, url) == expected
  45. def test_filter_markdown_integration(self, converter):
  46. """Integration test for filter_markdown"""
  47. sample_md = """
  48. [ Skip to main content ](https://pmc.ncbi.nlm.nih.gov/articles/PMC9919988/<#main-content>)
  49. ![](https://pmc.ncbi.nlm.nih.gov/static/img/us_flag.svg)
  50. ## PERMALINK
  51. Copy
  52. # Main Title Here
  53. ### Author Name
  54. [Author](https://example.com/<https://actual.com/path>)
  55. """
  56. url = "https://example.com"
  57. expected = ("[https://example.com](https://example.com)\n\n"
  58. "# Main Title Here\n\n"
  59. "### Author Name\n"
  60. "[Author](https://actual.com/path)")
  61. result = converter.filter_markdown(sample_md)
  62. result = converter.add_url_header(result, url)
  63. assert result.strip() == expected.strip()