import asyncio
import json
import os
import socket
import sys
from pathlib import Path

import requests
import yaml
from bs4 import BeautifulSoup
from dotenv import load_dotenv
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig

# Make the project package importable and point the DB layer at the local SQLite
# copy before importing project modules that read these settings at import time.
sys.path.append(str(Path(r'G:\code\upwork\zhang_crawl_bio')))
os.environ['DB_URL'] = 'sqlite:///' + str(Path(r'G:\code\upwork\zhang_crawl_bio\output\search_results copy.db'))

from worker.celery.search_client import get_uncompleted_keywords
from mylib.logu import logger
# load_dotenv()  # disabled: DB_URL and sys.path are set explicitly above
pc_name = 'pc1'  # which per-machine config to load
config_path = Path(f"config/pc_configs/{pc_name}.yaml")
def t_main():
    # Fetch the keywords that still need processing and show how many remain.
    res = get_uncompleted_keywords()
    print(res)
    print(len(res))


async def main():
    t_main()
    return
    # print(len(None))  # unreachable debugging leftover
# Leftover sample scraper snippet, kept commented out for reference:
# s = '''python\nimport requests\nfrom bs4 import BeautifulSoup\nimport json\n\ndef main():\n url = \"https://perinim.github.io/projects\"\n response = requests.get(url)\n soup = BeautifulSoup(response.content, 'html.parser')\n \n news_list = []\n \n for news in soup.find_all('div', class_='news-item'):\n title = news.find('h2').text.strip()\n description = news.find('p').text.strip()\n news_list.append({\n \"title\": title,\n \"description\": description\n })\n \n print(json.dumps(news_list, indent=4))\n\nif __name__ == \"__main__\":\n main()\n'''
# print(s)
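
# A minimal, hypothetical sketch (not part of the original flow): one way the
# otherwise-unused crawl4ai imports above could be wired to the keyword queue.
# Assumptions: get_uncompleted_keywords() returns an iterable of keyword strings,
# and the Bing search URL below is only a placeholder for the project's real target.
async def crawl_uncompleted_keywords_sketch():
    from urllib.parse import quote_plus

    keywords = get_uncompleted_keywords()
    browser_cfg = BrowserConfig(headless=True)
    run_cfg = CrawlerRunConfig()
    async with AsyncWebCrawler(config=browser_cfg) as crawler:
        for kw in keywords:
            url = f"https://www.bing.com/search?q={quote_plus(kw)}"  # placeholder URL
            result = await crawler.arun(url=url, config=run_cfg)
            print(f"{kw}: success={result.success}")
# The sketch is not invoked anywhere; call it from main() if and when needed.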
if __name__ == "__main__":
    asyncio.run(main())