scrapling_t.py

  1. """
  2. I only made this example to show how Scrapling features can be used to scrape a website without writing any selector
  3. so this script doesn't depend on the website structure.
  4. """
  5. import requests
  6. from scrapling import Adaptor
  7. from pathlib import Path
  8. from mylib.base import save_to_file
  9. from trafilatura import fetch_url, extract

def stackoverflow_demo():
    response = requests.get(
        'https://stackoverflow.com/questions/tagged/web-scraping?sort=MostVotes&filters=NoAcceptedAnswer&edited=true&pagesize=50&page=2'
    )
    page = Adaptor(response.text, url=response.url)
    # First, extract the first question's title and author based on their text content
    first_question_title = page.find_by_text('Run Selenium Python Script on Remote Server')
    first_question_author = page.find_by_text('Ryan')
    # Guard against missing elements, because this page changes a lot
    if first_question_title and first_question_author:
        # If you want, you can extract the other question cards like below
        first_question = first_question_title.find_ancestor(
            lambda ancestor: ancestor.attrib.get('id') and 'question-summary' in ancestor.attrib.get('id')
        )
        rest_of_questions = first_question.find_similar()
        # Since there is nothing to rely on to extract the other titles/authors from these elements
        # without CSS/XPath selectors, use the first title and first author found above as a
        # starting point and collect all similar elements on the page.
        for i, (title, author) in enumerate(zip(first_question_title.find_similar(), first_question_author.find_similar()), start=1):
            print(i, title.text, author.text)
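
# A hedged variant of the printing loop above: it collects the matched title/author pairs into a
# list of dicts instead of printing them. The function name and dict keys are my own naming, not
# something the original script defines.
def collect_questions(first_title, first_author) -> list:
    return [
        {"title": title.text, "author": author.text}
        for title, author in zip(first_title.find_similar(), first_author.find_similar())
    ]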

def analyze_html(html_content: str) -> dict:
    output_dir = Path(r'K:\code\upwork\zhang_crawl_bio\output\analyze')
    output_format_list = ["csv", "html", "json", "markdown", "txt", "xml", "xmltei"]
    # Collect the saved file paths so the declared dict return type is honoured
    save_paths = {}
    for output_format in output_format_list:
        result = extract(html_content, output_format=output_format, with_metadata=True)
        save_path = save_to_file(result, (output_dir / 'ext').with_suffix(f'.{output_format}'))
        print(f"save_path: {save_path}")
        save_paths[output_format] = save_path
    return save_paths

def find_search_div():
    file = Path(r'G:\code\upwork\zhang_crawl_bio\output\debug\查询不到搜索框-两个textarea.html')
    html_content = file.read_text(encoding='utf-8')
    page = Adaptor(html_content)
    # textarea_list = page.xpath('//body//form[@action="/search"]//textarea')
    textarea_list = page.xpath('//body//form[@action="/search"]//input')
    print("textarea_list:", textarea_list)
    for textarea in textarea_list:
        print("------------------")
        print("textarea.tag:", textarea.tag)
        print("textarea.attrib:", textarea.attrib)
        # print("textarea.text:", textarea.text)
        # print("textarea.html_content:", textarea.html_content)
        # if 'search' in textarea.html_content.lower():
        #     print("found an element containing the 'search' keyword")
        #     print("textarea.path:", textarea.path)

def find_verify_page():
    path = Path(r'G:\code\upwork\zhang_crawl_bio\output\results\Acantholimon erythraeum essential oil\crawled_urls\4801.html')
    content = path.read_text(encoding='utf-8')
    page = Adaptor(content)
    body = Adaptor(page.body)
    print("body:", body.get_all_text())
    # "真人" ("real person") is the marker text used here to detect a human-verification page
    print("is verification page:", "真人" in body.get_all_text())

def main():
    # google_search_demo()
    # res = find_search_div()
    # print("res:", res)
    find_verify_page()
    # file = Path(r'G:\code\upwork\zhang_crawl_bio\output\debug\查询不到搜索框-两个textarea.html')
    # html_content = file.read_text(encoding='utf-8')
    # page = Adaptor(html_content)
    # textarea = page.xpath_first(res)
    # print("textarea:", textarea.html_content)


if __name__ == "__main__":
    main()