# scrapling_t.py
"""
This example shows how Scrapling's features can be used to scrape a website without writing any selectors,
so the script does not depend on the website's structure.
"""
import requests
from scrapling import Adaptor
  7. def stackoverflow_demo():
  8. response = requests.get('https://stackoverflow.com/questions/tagged/web-scraping?sort=MostVotes&filters=NoAcceptedAnswer&edited=true&pagesize=50&page=2')
  9. page = Adaptor(response.text, url=response.url)
  10. # First we will extract the first question title and its author based on the text content
  11. first_question_title = page.find_by_text('Run Selenium Python Script on Remote Server')
  12. first_question_author = page.find_by_text('Ryan')
  13. # because this page changes a lot
  14. if first_question_title and first_question_author:
  15. # If you want you can extract other questions tags like below
  16. first_question = first_question_title.find_ancestor(
  17. lambda ancestor: ancestor.attrib.get('id') and 'question-summary' in ancestor.attrib.get('id')
  18. )
  19. rest_of_questions = first_question.find_similar()
  20. # But since nothing to rely on to extract other titles/authors from these elements without CSS/XPath selectors due to the website nature
  21. # We will get all the rest of the titles/authors in the page depending on the first title and the first author we got above as a starting point
  22. for i, (title, author) in enumerate(zip(first_question_title.find_similar(), first_question_author.find_similar()), start=1):
  23. print(i, title.text, author.text)
  24. def google_search_demo():
  25. search_key = "python"
  26. def main():
  27. google_search_demo()
  28. if __name__ == "__main__":
  29. main()