t.py
import asyncio
import os
from pathlib import Path
import sys
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig
import requests
from bs4 import BeautifulSoup
import json
from dotenv import load_dotenv

# Make the project root importable and point the worker at a local SQLite copy
# before pulling in project modules.
sys.path.append(str(Path(r'G:\code\upwork\zhang_crawl_bio')))
os.environ['DB_URL'] = 'sqlite:///' + str(Path(r'G:\code\upwork\zhang_crawl_bio\output\search_results copy.db'))

from worker.celery.search_client import get_uncompleted_keywords
import yaml
import socket
from mylib.logu import logger

# load_dotenv()
config_path = Path(f"config/pc_configs/{'pc1'}.yaml")  # per-machine config path; the literal 'pc1' is hard-coded here


def t_main():
    # Fetch the keywords that have not been processed yet and report how many remain.
    res = get_uncompleted_keywords()
    print(res)
    print(len(res))


async def main():
    t_main()
    return
    print(len(None))  # unreachable: the early return above short-circuits the rest


# s = '''python\nimport requests\nfrom bs4 import BeautifulSoup\nimport json\n\ndef main():\n url = \"https://perinim.github.io/projects\"\n response = requests.get(url)\n soup = BeautifulSoup(response.content, 'html.parser')\n \n news_list = []\n \n for news in soup.find_all('div', class_='news-item'):\n title = news.find('h2').text.strip()\n description = news.find('p').text.strip()\n news_list.append({\n \"title\": title,\n \"description\": description\n })\n \n print(json.dumps(news_list, indent=4))\n\nif __name__ == \"__main__\":\n main()\n'''
# print(s)

if __name__ == "__main__":
    asyncio.run(main())