# scrapling_t.py (2.0 KB)
"""
This example shows how Scrapling features can be used to scrape a website without writing any selectors,
so the script does not depend on the website's structure.
"""
  5. import requests
  6. from scrapling import Adaptor
  7. from pathlib import Path
  8. from mylib.base import save_to_file
  9. def stackoverflow_demo():
  10. response = requests.get('https://stackoverflow.com/questions/tagged/web-scraping?sort=MostVotes&filters=NoAcceptedAnswer&edited=true&pagesize=50&page=2')
  11. page = Adaptor(response.text, url=response.url)
  12. # First we will extract the first question title and its author based on the text content
  13. first_question_title = page.find_by_text('Run Selenium Python Script on Remote Server')
  14. first_question_author = page.find_by_text('Ryan')
  15. # because this page changes a lot
  16. if first_question_title and first_question_author:
  17. # If you want you can extract other questions tags like below
  18. first_question = first_question_title.find_ancestor(
  19. lambda ancestor: ancestor.attrib.get('id') and 'question-summary' in ancestor.attrib.get('id')
  20. )
  21. rest_of_questions = first_question.find_similar()
  22. # But since nothing to rely on to extract other titles/authors from these elements without CSS/XPath selectors due to the website nature
  23. # We will get all the rest of the titles/authors in the page depending on the first title and the first author we got above as a starting point
  24. for i, (title, author) in enumerate(zip(first_question_title.find_similar(), first_question_author.find_similar()), start=1):
  25. print(i, title.text, author.text)
  26. def google_search_demo():
  27. file = Path(r'K:\code\upwork\zhang_crawl_bio\output\google_search\Acalypha manniana essential oil\10.html')
  28. html_content = file.read_text(encoding='utf-8')
  29. page = Adaptor(html_content)
  30. page.has_class('quote')
  31. print(page.find_by_text('Medicinal plants from the genus Acalypha'))
  32. def main():
  33. google_search_demo()
  34. if __name__ == "__main__":
  35. main()