# news_paper_t.py
"""Parse a locally saved HTML page with newspaper's ``Article`` and print its fields.

The page is read from disk and handed to ``Article`` via ``set_html`` instead
of being downloaded, so no network request is made by this script itself.
"""

import os

from newspaper import Article

# Default input file, used when ``main`` is called without an explicit path.
DEFAULT_HTML_PATH = r"K:\code\upwork\zhang_crawl_bio\output\google_search\Acalypha martiana essential oil\all_page_html\3.html"


def main(file_path: str = DEFAULT_HTML_PATH) -> None:
    """Parse the HTML file at *file_path* and print the extracted article data.

    Args:
        file_path: Path to a UTF-8 encoded HTML file. Defaults to
            ``DEFAULT_HTML_PATH``.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file content is not valid UTF-8.
    """
    # Read the local HTML file content.
    with open(file_path, 'r', encoding='utf-8') as file:
        html_content = file.read()

    # Create an Article with an empty URL: the HTML is supplied manually
    # below rather than fetched.
    first_article = Article(url='', language='en')

    # Set the HTML content by hand, then parse the article.
    first_article.set_html(html_content)
    first_article.parse()

    # Print the article title and the additional extracted data.
    print(first_article.title)
    print(first_article.authors)
    print(first_article.additional_data)
    # NOTE(review): newspaper appears to fill ``summary`` only in
    # ``Article.nlp()``, which is never called here, so this presumably
    # prints an empty summary -- confirm, and call ``nlp()`` if a real
    # summary is wanted (it needs the NLTK tokenizer data installed).
    print(f"summary {first_article.summary}")
    print(f"url {first_article.url}")


if __name__ == "__main__":
    main()