| 1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980818283848586878889909192 |
- import base64
- import datetime
- import json
- import pickle
- import os
- import sys
- sys.path.append(os.path.dirname(os.path.dirname(__file__)))
- from conf.config import OUTPUT,PAGE_OUTPUT,logger
- from database.config import minio_block
- import prefect.runtime
- from dp.page import page
- import time
- from datetime import datetime
- from prefect import runtime
- from database.config import minio_client
- from database.s3 import S3Object,S3
# URL of the logged-in user's own Douyin profile page.
HOME_URL = 'https://www.douyin.com/user/self'

# Looked up once at import time; save_html_to_s3() reads this module-level
# tab directly instead of resolving one through get_tab().
tab = page.get_tab(url=HOME_URL)
def get_tab(tab_id=None):
    """Resolve the browser tab to operate on.

    Lookup order: the explicit ``tab_id`` argument, then the ``tab_id``
    parameter of the current Prefect flow run, and finally ``page.tab``.
    A falsy id at either of the first two steps falls through to the next.

    Args:
        tab_id: Identifier handed to ``page.get_tab``; optional.

    Returns:
        The result of ``page.get_tab`` for the resolved id, or ``page.tab``
        when no id is available.
    """
    # `or` short-circuits, so the flow-run parameters are only consulted
    # when the caller did not supply a usable id.
    resolved_id = tab_id or runtime.flow_run.parameters.get('tab_id', None)
    if not resolved_id:
        return page.tab
    return page.get_tab(resolved_id)
def get_object_name_by_time():
    """Build a time-based object name ending in a task-name placeholder.

    The result has the shape ``YYYYMMDD/HHMMSS_ffffff-{task_run.task_name}``,
    e.g. ``20240131/023213_123456-{task_run.task_name}``.  ``%f`` yields the
    full six-digit microsecond field; it is NOT truncated to milliseconds.
    The trailing ``{task_run.task_name}`` is returned literally and is not
    substituted by this function.

    Returns:
        The formatted name as a ``str``.
    """
    # Literal placeholder text, deliberately kept out of any f-string so the
    # braces are not interpreted here.
    placeholder = '-{task_run.task_name}'
    stamp = f"{datetime.now():%Y%m%d/%H%M%S_%f}"
    return stamp + placeholder
def get_result(path: str):
    """Load and unpickle a result stored under ``minio_block``.

    The stored payload is read as JSON; its ``'data'`` field is
    base64-decoded and the resulting bytes are unpickled.

    Args:
        path: Storage path of the result.  A leading
            ``minio_block.basepath`` prefix is stripped before reading.

    Returns:
        The object produced by ``pickle.loads``.
    """
    prefix = minio_block.basepath
    relative_path = path[len(prefix):] if path.startswith(prefix) else path

    raw_payload = minio_block.read_path(relative_path)
    envelope = json.loads(raw_payload)
    pickled = base64.b64decode(envelope['data'])
    # NOTE(review): pickle.loads can execute arbitrary code; this is only
    # safe if everything under minio_block is written by trusted producers.
    return pickle.loads(pickled)
def save_html_to_s3(file_name):
    """Dump the module-level tab's HTML to a local file and upload it.

    The HTML of the module-level ``tab`` is written to ``file_name`` inside
    a fixed local page directory, then that same file is uploaded to the
    ``public`` bucket under
    ``/md/ai-yunying/<time-based name>-<file_name>``.

    Args:
        file_name: Name of the local file; also used as the suffix of the
            uploaded object name.

    Returns:
        None.  The upload result is only logged.
    """
    # Local import: the module imports `pathlib` but never binds `Path`,
    # so the bare name would otherwise raise NameError here.
    from pathlib import Path

    page_dir = Path(r'I:\code\ai-yunying\live-online-people\output\page\\')
    file_path = page_dir / file_name

    # The context manager closes (and flushes) the file before the upload
    # reads it back from disk.  UTF-8 is set explicitly so the write does
    # not depend on the platform's locale encoding.
    with open(file_path, 'w', encoding='utf-8') as f:
        f.write(tab.html)

    s3minio = S3(bucket='public', client=minio_client)
    base_path = '/md/ai-yunying/'
    obj_name = base_path + s3minio.get_object_name_by_time() + '-' + file_name
    # Upload exactly the file that was just written rather than rebuilding
    # the path from a second copy of the directory literal.
    res = s3minio.fput(str(file_path), obj_name)
    logger.info(f"{res.bucket_name } {res.object_name} {res._http_headers}")
- import pathlib
def save_page_info(file_name='', tab_id=None, local=True, s3=False):
    """Save a tab's HTML and screenshot locally and/or to S3 storage.

    Args:
        file_name: Base name (without extension) for the ``.html`` and
            ``.png`` outputs.
        tab_id: Optional tab identifier, resolved through ``get_tab``.
        local: When true, write both files under ``OUTPUT/<YYYY-MM-DD>/``.
        s3: When true, upload both payloads to the ``swl`` bucket as
            ``<file_name>.html`` and ``<file_name>.png``.

    Returns:
        A relative ``pathlib.Path`` named after today's date
        (``YYYY-MM-DD``), i.e. the sub-directory used for local output.
    """
    base_time_dir = pathlib.Path(datetime.now().strftime("%Y-%m-%d"))
    tab = get_tab(tab_id)
    # presumably PNG-encoded bytes (it is written to a .png file) - confirm
    png = tab.get_screenshot(as_bytes=True)
    # Read the HTML once so the local copy and the uploaded copy are the
    # same snapshot of the page.
    html = tab.html

    if local:
        save_dir = OUTPUT / base_time_dir
        logger.info(f"{save_dir}")
        # exist_ok avoids the check-then-create race of a separate
        # os.path.exists() test.
        os.makedirs(save_dir, exist_ok=True)
        # Explicit UTF-8 so the write does not depend on the platform's
        # locale encoding.
        with open(save_dir / f'{file_name}.html', 'w', encoding='utf-8') as f:
            f.write(html)
        with open(save_dir / f'{file_name}.png', 'wb') as f:
            f.write(png)

    if s3:
        s3minio = S3(bucket='swl', client=minio_client)
        # Objects are stored directly under "<file_name>.<ext>"; no dated
        # or time-based prefix is applied to the object name.
        res = s3minio.put(html, f'{file_name}.html')
        logger.info(f"{res.bucket_name } {res.object_name} {res._http_headers}")
        res = s3minio.put(png, f'{file_name}.png')
        # Log the screenshot upload as well, mirroring the HTML upload.
        logger.info(f"{res.bucket_name } {res.object_name} {res._http_headers}")

    return base_time_dir
-
def main():
    """Manual entry point: snapshot one hard-coded tab under a fixed name."""
    target_tab = 'A017888A62FE53FAD9D85ED2662FEA34'
    snapshot_name = '点击陌生人对话框'
    save_page_info(file_name=snapshot_name, tab_id=target_tab)


if __name__ == "__main__":
    main()
|