- """
- I only made this example to show how Scrapling features can be used to scrape a website without writing any selector
- so this script doesn't depend on the website structure.
- """
- import requests
- from scrapling import Adaptor
- from pathlib import Path
- from mylib.base import save_to_file
- from trafilatura import fetch_url, extract
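
# `mylib.base` is a project-internal module that isn't included here. A minimal
# stand-in for `save_to_file`, assuming it writes the text to the given path
# (creating parent directories first) and returns that path, could look like:
#
#     def save_to_file(content: str | None, path: Path) -> Path:
#         path.parent.mkdir(parents=True, exist_ok=True)
#         path.write_text(content or '', encoding='utf-8')
#         return path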


def stackoverflow_demo():
    response = requests.get('https://stackoverflow.com/questions/tagged/web-scraping?sort=MostVotes&filters=NoAcceptedAnswer&edited=true&pagesize=50&page=2')
    page = Adaptor(response.text, url=response.url)
    # First, extract the first question's title and author by their text content,
    # because this page changes a lot.
    first_question_title = page.find_by_text('Run Selenium Python Script on Remote Server')
    first_question_author = page.find_by_text('Ryan')
    if first_question_title and first_question_author:
        # If you want, you can extract the other question containers like below
        first_question = first_question_title.find_ancestor(
            lambda ancestor: ancestor.attrib.get('id') and 'question-summary' in ancestor.attrib.get('id')
        )
        rest_of_questions = first_question.find_similar()  # the other question containers (collected but unused here)
        # Since there is nothing to rely on for extracting the other titles/authors
        # from these elements without CSS/XPath selectors, given the website's nature,
        # we get all the remaining titles/authors on the page by using the first
        # title and the first author found above as starting points.
        for i, (title, author) in enumerate(
            zip(first_question_title.find_similar(), first_question_author.find_similar()),
            start=1,
        ):
            print(i, title.text, author.text)


def analyze_html(html_content: str) -> None:
    """Extract the page content with trafilatura in every supported output format and save each result."""
    output_dir = Path(r'K:\code\upwork\zhang_crawl_bio\output\analyze')
    output_format_list = ["csv", "html", "json", "markdown", "txt", "xml", "xmltei"]
    for output_format in output_format_list:
        result = extract(html_content, output_format=output_format, with_metadata=True)
        save_path = save_to_file(result, Path(output_dir / 'ext').with_suffix(f'.{output_format}'))
        print(f"save_path: {save_path}")


def find_search_div():
    file = Path(r'G:\code\upwork\zhang_crawl_bio\output\debug\查询不到搜索框-两个textarea.html')
    html_content = file.read_text(encoding='utf-8')
    page = Adaptor(html_content)
    # textarea_list = page.xpath('//body//form[@action="/search"]//textarea')
    input_list = page.xpath('//body//form[@action="/search"]//input')
    print("input_list:", input_list)
    for element in input_list:
        print("------------------")
        print("element.tag:", element.tag)
        print("element.attrib:", element.attrib)
        # print("element.text:", element.text)
        # print("element.html_content:", element.html_content)
        # if 'search' in element.html_content.lower():
        #     print("Found the element containing the 'search' keyword")
        #     print("element.path:", element.path)


def find_verify_page():
    path = Path(r'G:\code\upwork\zhang_crawl_bio\output\results\Acantholimon erythraeum essential oil\crawled_urls\4801.html')
    content = path.read_text(encoding='utf-8')
    page = Adaptor(content)
    body = Adaptor(page.body)
    print("body text:", body.get_all_text())
    # "真人" ("real person") appears in the verification page's text, so its
    # presence in the body text flags this file as a verify/CAPTCHA page.
    print("is verify page:", "真人" in body.get_all_text())


def main():
    # google_search_demo()
    # res = find_search_div()
    # print("res:", res)
    find_verify_page()
    # file = Path(r'G:\code\upwork\zhang_crawl_bio\output\debug\查询不到搜索框-两个textarea.html')
    # html_content = file.read_text(encoding='utf-8')
    # page = Adaptor(html_content)
    # textarea = page.xpath_first(res)
    # print("textarea:", textarea.html_content)


if __name__ == "__main__":
    main()