| 1234567891011121314151617181920212223242526272829 |
- from newspaper import Article
- import os
# Sample page the script was originally written against; used when the caller
# does not supply a path of their own.
DEFAULT_HTML_PATH = r"K:\code\upwork\zhang_crawl_bio\output\google_search\Acalypha martiana essential oil\all_page_html\3.html"


def main(file_path: str = DEFAULT_HTML_PATH) -> None:
    """Parse a locally saved HTML page with newspaper and print its fields.

    The page is read from disk and handed to an ``Article`` directly, so no
    download step is performed and the article URL stays empty.

    Args:
        file_path: Path of the HTML file to read. The file is decoded as
            UTF-8. Defaults to ``DEFAULT_HTML_PATH``.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file content is not valid UTF-8.
    """
    # Read the saved HTML from disk.
    with open(file_path, 'r', encoding='utf-8') as file:
        html_content = file.read()

    # Build an Article with an empty URL and inject the markup manually
    # instead of downloading it.
    first_article = Article(url='', language='en')
    first_article.set_html(html_content)

    # Extract title, authors, etc. from the injected HTML.
    first_article.parse()

    # Print the extracted fields.
    print(first_article.title)
    print(first_article.authors)
    print(first_article.additional_data)
    # NOTE(review): nlp() is never called in this script, so summary is
    # presumably still empty here -- confirm against the newspaper docs.
    print(f"summary {first_article.summary}")
    print(f"url {first_article.url}")
# Script entry point: run the demo only when executed directly, not on import.
if __name__ == "__main__":
    main()
|