@@ -1,38 +1,33 @@
 import asyncio
 from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig
-from mylib.base import OUTPUT_DIR
+import requests
+from bs4 import BeautifulSoup
+import json
+from dotenv import load_dotenv
+from scrapegraphai.graphs import ScriptCreatorGraph
+from scrapegraphai.utils import prettify_exec_info
 
-async def main():
-    # 1) Reference your persistent data directory
-    browser_config = BrowserConfig(
-        headless=False,  # 'True' for automated runs
-        verbose=True,
-        use_managed_browser=False,  # Enables persistent browser strategy
-        use_persistent_context=True,
-        browser_type="chromium",
-        user_data_dir=OUTPUT_DIR / "user_data_dir2"
-    )
+load_dotenv()
+
+def t_main():
+    url = "https://perinim.github.io/projects"
+    response = requests.get(url)
+    soup = BeautifulSoup(response.content, 'html.parser')
 
-    # 2) Standard crawl config
-    search_key='Acalypha malabarica essential oil'
-    start=30
-    url = f"https://www.google.com/search?q={search_key}&start={start}"
-    print(f"search url: {url}")
-    crawl_config = CrawlerRunConfig(
-        wait_for="css:.logged-in-content",
-        url=url,
-    )
-    browser_config.url = url
-    async with AsyncWebCrawler(config=browser_config) as crawler:
+    news_list = []
 
-        # crawler = AsyncWebCrawler(config=browser_config)
-        # await crawler.start()
-        result = await crawler.arun(
-            url=url,
-            config=crawl_config,
-        )
-        # save_to_pickle(result, OUTPUT_DIR / f"{search_key}.pickle")
-        return result
+    for news in soup.find_all('div', class_='news-item'):
+        title = news.find('h2').text.strip()
+        description = news.find('p').text.strip()
+        news_list.append({
+            "title": title,
+            "description": description
+        })
 
+    print(json.dumps(news_list, indent=4))
+async def main():
+    t_main()
+    # s = '''python\nimport requests\nfrom bs4 import BeautifulSoup\nimport json\n\ndef main():\n    url = \"https://perinim.github.io/projects\"\n    response = requests.get(url)\n    soup = BeautifulSoup(response.content, 'html.parser')\n    \n    news_list = []\n    \n    for news in soup.find_all('div', class_='news-item'):\n        title = news.find('h2').text.strip()\n        description = news.find('p').text.strip()\n        news_list.append({\n            \"title\": title,\n            \"description\": description\n        })\n    \n    print(json.dumps(news_list, indent=4))\n\nif __name__ == \"__main__\":\n    main()\n'''
+    # print(s)
 if __name__ == "__main__":
     asyncio.run(main())