t.py
import asyncio
import os
from pathlib import Path
import sys
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig
import requests
from bs4 import BeautifulSoup
import json
from dotenv import load_dotenv

# Make the project root importable and point the worker at a local SQLite copy
# before pulling in project modules.
sys.path.append(str(Path(r'G:\code\upwork\zhang_crawl_bio')))
os.environ['DB_URL'] = 'sqlite:///' + str(Path(r'G:\code\upwork\zhang_crawl_bio\output\search_results copy.db'))

from worker.celery.search_client import get_uncompleted_keywords
import yaml
import socket
from mylib.logu import logger

# load_dotenv()
config_path = Path(f"config/pc_configs/{'pc1'}.yaml")  # per-machine config path; the literal 'pc1' is hard-coded here


def t_main():
    # Fetch the keywords that have not been processed yet and report how many remain.
    res = get_uncompleted_keywords()
    print(res)
    print(len(res))


async def main():
    t_main()
    return
    print(len(None))  # unreachable: the early return above short-circuits the rest


# s = '''python\nimport requests\nfrom bs4 import BeautifulSoup\nimport json\n\ndef main():\n url = \"https://perinim.github.io/projects\"\n response = requests.get(url)\n soup = BeautifulSoup(response.content, 'html.parser')\n \n news_list = []\n \n for news in soup.find_all('div', class_='news-item'):\n title = news.find('h2').text.strip()\n description = news.find('p').text.strip()\n news_list.append({\n \"title\": title,\n \"description\": description\n })\n \n print(json.dumps(news_list, indent=4))\n\nif __name__ == \"__main__\":\n main()\n'''
# print(s)

if __name__ == "__main__":
    asyncio.run(main())