scrapling_t.py

  1. """
  2. I only made this example to show how Scrapling features can be used to scrape a website without writing any selector
  3. so this script doesn't depend on the website structure.
  4. """
  5. import requests
  6. from scrapling import Adaptor
  7. from pathlib import Path
  8. from mylib.base import save_to_file
  9. from trafilatura import fetch_url, extract

def stackoverflow_demo():
    response = requests.get(
        'https://stackoverflow.com/questions/tagged/web-scraping?sort=MostVotes&filters=NoAcceptedAnswer&edited=true&pagesize=50&page=2'
    )
    page = Adaptor(response.text, url=response.url)
    # First, extract the first question's title and author based on their text content
    first_question_title = page.find_by_text('Run Selenium Python Script on Remote Server')
    first_question_author = page.find_by_text('Ryan')
    # Guard against missing elements, because this page changes a lot
    if first_question_title and first_question_author:
        # If you want, you can extract the other question cards like below
        first_question = first_question_title.find_ancestor(
            lambda ancestor: ancestor.attrib.get('id') and 'question-summary' in ancestor.attrib.get('id')
        )
        rest_of_questions = first_question.find_similar()
        # Since there is nothing to rely on to extract the other titles/authors from these elements
        # without CSS/XPath selectors, use the first title and first author found above as a
        # starting point and collect all similar elements on the page.
        for i, (title, author) in enumerate(zip(first_question_title.find_similar(), first_question_author.find_similar()), start=1):
            print(i, title.text, author.text)
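
# A hedged variant of the printing loop above: it collects the matched title/author pairs into a
# list of dicts instead of printing them. The function name and dict keys are my own naming, not
# something the original script defines.
def collect_questions(first_title, first_author) -> list:
    return [
        {"title": title.text, "author": author.text}
        for title, author in zip(first_title.find_similar(), first_author.find_similar())
    ]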

def analyze_html(html_content: str) -> dict:
    output_dir = Path(r'K:\code\upwork\zhang_crawl_bio\output\analyze')
    output_format_list = ["csv", "html", "json", "markdown", "txt", "xml", "xmltei"]
    # Collect the saved file paths so the declared dict return type is honoured
    save_paths = {}
    for output_format in output_format_list:
        result = extract(html_content, output_format=output_format, with_metadata=True)
        save_path = save_to_file(result, (output_dir / 'ext').with_suffix(f'.{output_format}'))
        print(f"save_path: {save_path}")
        save_paths[output_format] = save_path
    return save_paths

def find_search_div():
    file = Path(r'G:\code\upwork\zhang_crawl_bio\output\debug\查询不到搜索框-两个textarea.html')
    html_content = file.read_text(encoding='utf-8')
    page = Adaptor(html_content)
    # textarea_list = page.xpath('//body//form[@action="/search"]//textarea')
    textarea_list = page.xpath('//body//form[@action="/search"]//input')
    print("textarea_list:", textarea_list)
    for textarea in textarea_list:
        print("------------------")
        print("textarea.tag:", textarea.tag)
        print("textarea.attrib:", textarea.attrib)
        # print("textarea.text:", textarea.text)
        # print("textarea.html_content:", textarea.html_content)
        # if 'search' in textarea.html_content.lower():
        #     print("found an element containing the 'search' keyword")
        #     print("textarea.path:", textarea.path)

def find_verify_page():
    path = Path(r'G:\code\upwork\zhang_crawl_bio\output\results\Acantholimon erythraeum essential oil\crawled_urls\4801.html')
    content = path.read_text(encoding='utf-8')
    page = Adaptor(content)
    body = Adaptor(page.body)
    print("body:", body.get_all_text())
    # "真人" ("real person") is the marker text used here to detect a human-verification page
    print("is verification page:", "真人" in body.get_all_text())

def main():
    # google_search_demo()
    # res = find_search_div()
    # print("res:", res)
    find_verify_page()
    # file = Path(r'G:\code\upwork\zhang_crawl_bio\output\debug\查询不到搜索框-两个textarea.html')
    # html_content = file.read_text(encoding='utf-8')
    # page = Adaptor(html_content)
    # textarea = page.xpath_first(res)
    # print("textarea:", textarea.html_content)


if __name__ == "__main__":
    main()