| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222 |
- import datetime
- import json
- import os
- import sys
- import time
- import re
- import signal
- from DrissionPage import ChromiumPage
- from requests_html import HTML
- import requests_html
- from conf.settings import chome_options,OUTPUT,logger
class Detection():
    """Resolve Douyin share/home/live links into structured information.

    All page access goes through a single DrissionPage ``ChromiumPage``
    created in ``__init__``; the rendered HTML is then parsed with
    ``requests_html.HTML`` to pull out user names and live-room links.
    """

    # CSS selector of the nickname element on a user home page.
    # NOTE(review): "Nu66P_ba" looks like a generated class name and may
    # change when the site is redeployed -- confirm periodically.
    _HOMEPAGE_NAME_SELECTOR = 'span[class="Nu66P_ba"]'
    # CSS selector of the nickname element on a live-room page.
    _LIVE_NAME_SELECTOR = 'div[data-e2e="live-room-nickname"]'

    # First douyin.com URL inside arbitrary text; group 1 is the URL,
    # group 2 the sub-domain ("v", "www", "live", ...).
    _LINK_RE = re.compile(r'(https?://(\w*)\.douyin\.com\S*)')
    # Canonical live-room URL (scheme + host + numeric room id only).
    _LIVE_URL_RE = re.compile(r'(https?://live\.douyin\.com/\d+)')

    # Number of files in OUTPUT at which the oldest one is removed.
    _MAX_SAVED_PAGES = 30

    # Help text appended to the "format error" log message.
    _SUPPORTED_LINKS_HELP = (
        "\n"
        "    - 分享主页: 长按复制此条消息,打开抖音搜索,查看TA的更多作品。 https://v.douyin.com/iRMFSx59/\n"
        "    - 直播分享: 3- #在抖音,记录美好生活#【麦穗儿🦋129】正在直播,来和我一起支持Ta吧。复制下方链接,打开【抖音】,直接观看直播! https://v.douyin.com/iRMFCkqF/ 1@5.com 12/29\n"
        "    - 用户主页 https://www.douyin.com/user/MS4wLjABAAAAml99q0O4A4rk9SCLeVJXaWGi5e2pKxvQ1Oe0CjfTSHg\n"
        "    - 直播间地址: https://live.douyin.com/363178125769?camera_id=0\n"
    )

    def __init__(self) -> None:
        """Open the browser page used by every other method."""
        self.page = ChromiumPage(chome_options)
        # Interrupt/cleanup handling is currently disabled:
        # signal.signal(signal.SIGINT, self.handle_interrupt)

    # def __del__(self):
    #     self.page.close_tabs()

    # def handle_interrupt(self, signum, frame):
    #     logger.info("Received interrupt signal. Cleaning up...")
    #     self.page.close_tabs()
    #     sys.exit(0)

    def analyze_link(self, string):
        """Find the first Douyin URL in ``string`` and analyse it.

        ``string`` may be any text containing a Douyin link, e.g.:
          - a shared profile message ending in https://v.douyin.com/iRMFSx59/
          - a shared live-stream message containing https://v.douyin.com/iRMFCkqF/
          - a user home page https://www.douyin.com/user/<sec_uid>
          - a live room https://live.douyin.com/363178125769?camera_id=0

        Returns:
          - the result of ``analyze_short_url`` / ``analyze_homepage_url`` /
            ``analyze_live_url(check=True)`` for the ``v`` / ``www`` /
            ``live`` sub-domains respectively (each may itself be ``None``);
          - ``{}`` when the sub-domain is none of those;
          - ``None`` when no douyin.com URL is found in ``string``.
        """
        res = self._LINK_RE.search(string)
        logger.debug(f'analyze_url:{string} re:{res if not res else res.groups()}')

        if not res:
            logger.info(f"格式错误 {string} ,支持以下链接形式:\n" + self._SUPPORTED_LINKS_HELP)
            return None

        url, subdomain = res.groups()
        ret = {}
        if subdomain == 'v':
            # Short share link, e.g. https://v.douyin.com/iRMFSx59/
            ret = self.analyze_short_url(url)
        elif subdomain == 'www':
            # User home page, e.g. https://www.douyin.com/user/MS4wLjA...
            ret = self.analyze_homepage_url(url)
        elif subdomain == 'live':
            # Live room, with or without a query string.
            ret = self.analyze_live_url(url, check=True)
        logger.debug(f"ret:{ret}")
        return ret

    def analyze_short_url(self, share_url):
        """Open a ``v.douyin.com`` short link and classify where it lands.

        Returns:
          - ``{"homepage": <final url>, "name": <nickname>}`` when the final
            URL contains ``"user"``;
          - ``{"live_url": <final url>, "name": <nickname>}`` when it
            contains ``"live"``;
          - ``None`` otherwise.
        """
        logger.debug(f'share_url: {share_url}')
        self.page.get(share_url)
        self.page.wait.load_complete()
        # Give the redirect a moment to finish before reading page.url.
        time.sleep(1)

        landed_url = self.page.url
        if 'user' in landed_url:
            return {"homepage": landed_url,
                    "name": self.get_user_name(self._HOMEPAGE_NAME_SELECTOR)}
        if 'live' in landed_url:
            return {"live_url": landed_url,
                    "name": self.get_user_name(self._LIVE_NAME_SELECTOR)}
        return None

    def get_user_name(self, selector):
        """Return the text of the first element matching ``selector`` in the
        currently loaded page, or ``''`` when nothing matches."""
        name_ele = HTML(html=self.page.html).find(selector, first=True)
        return name_ele.text if name_ele else ''

    def analyze_live_url(self, url, check=False):
        """Normalise a live-room URL and optionally verify that it exists.

        ``url`` may carry a query string (e.g. ``?camera_id=0`` or the long
        tracking query found on home-page anchors); only
        ``http(s)://live.douyin.com/<digits>`` is kept.

        Returns:
          - ``{"live_url": <normalised url>}`` when ``check`` is false;
          - ``{"live_url": <normalised url>, "name": <nickname>}`` when
            ``check`` is true and ``check_live_exist`` reports the room;
          - ``None`` when ``url`` does not contain a live-room URL, or when
            ``check`` is true and the room was not found.
        """
        matched = self._LIVE_URL_RE.search(url or '')
        if not matched:
            # Log the raw input: there is no normalised URL on this path.
            # (The previous code referenced an unassigned local here and
            # raised UnboundLocalError instead of logging.)
            logger.warning(f"live link error:{url}")
            return None

        live_url = matched.group(0)
        ret = {"live_url": live_url}
        if check:
            if not self.check_live_exist(live_url):
                logger.warning(f"live room not exist:{live_url}")
                return None
            # check_live_exist has just loaded the live page, so the
            # nickname element of that page can be read now.
            ret["name"] = self.get_user_name(self._LIVE_NAME_SELECTOR)
        return ret

    def analyze_homepage_url(self, url):
        """Load a user home page and report the user's name and live status.

        Returns a dict with keys ``name``, ``homepage`` and ``live_url``;
        ``live_url`` is ``""`` when no live link was found on the page.
        """
        logger.debug(f'{url}')
        self.page.get(url)
        self.page.wait.load_complete()

        ret = {"name": self.get_user_name(self._HOMEPAGE_NAME_SELECTOR),
               "homepage": url}
        live_info = self.get_user_homepage_live_link(self.page.html)
        # None and {} both mean "no live link".
        ret.update(live_info if live_info else {"live_url": ""})
        logger.debug(f'ret {ret}')
        return ret

    def get_user_homepage_live_link(self, html):
        """Extract the live-room link from a user home page's HTML text.

        Returns:
          - the result of ``analyze_live_url(<anchor href>)`` when the
            ``div[data-e2e="user-detail"]`` block contains an ``<a>``;
          - ``{}`` when that block has no ``<a>``;
          - ``None`` when the block itself is missing (the HTML is then
            dumped via ``save_html`` for inspection).
        """
        logger.debug(f"{html[:10]}...")
        # Keep the raw text under its own name: save_html needs a str, not
        # the parsed document.
        document = HTML(html=html)
        user = document.find('div[data-e2e="user-detail"]', first=True)
        if not user:
            path = self.save_html(html)
            logger.warning(f"Can not find <div[data-e2e=\"user-detail\"]> in user home page.Html save to:{path}")
            return None

        anchor = user.find('a', first=True)
        if not anchor:
            logger.info("The user is not live streaming")
            return {}

        # An <a> inside the user-detail block is treated as the live link.
        href = anchor.attrs.get('href', '')
        logger.debug(href)
        # check is left off, so the browser stays on the home page and no
        # request is made to the live room.
        return self.analyze_live_url(href)

    def check_live_exist(self, url):
        """Load ``url`` and report whether the live-room container is present.

        Returns ``1`` when the ``LV2pOyWA __leftContainer`` div is found and
        ``None`` otherwise.
        NOTE(review): the original notes describe the result as ``1 | 0``;
        the falsy value actually returned is ``None``.
        """
        self.page.get(url)
        element = self.page.ele("xpath://div[@class='LV2pOyWA __leftContainer']")
        if element:
            return 1
        return None

    def check_live_url(self, url):
        """Load a live-room URL and classify its state.

        Returns:
          - ``2`` when no ``div.basicPlayer`` element is present;
          - ``0`` when that element's text contains "结束" (ended);
          - ``None`` otherwise.
        NOTE(review): the original notes also list ``1`` (normal) and ``3``
        (mobile-only stream); neither value is produced by this code.
        """
        self.page.get(url)
        self.page.wait.load_complete()
        basic_ele = HTML(html=self.page.html).find('div.basicPlayer', first=True)
        if not basic_ele:
            return 2
        if '结束' in basic_ele.text:
            return 0
        return None

    def save_html(self, content):
        """Write ``content`` (HTML text) to a timestamped file under OUTPUT.

        When OUTPUT already holds ``_MAX_SAVED_PAGES`` or more entries, the
        one with the oldest modification time is removed first.

        Returns the path of the file written.
        """
        directory = OUTPUT
        os.makedirs(directory, exist_ok=True)

        files = os.listdir(directory)
        if len(files) >= self._MAX_SAVED_PAGES:
            oldest = min(
                files,
                key=lambda x: os.path.getmtime(os.path.join(directory, x)),
            )
            os.remove(os.path.join(directory, oldest))

        # Millisecond-resolution timestamp: drop the last 3 digits of %f.
        filename = datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S_%f")[:-3] + '.html'
        file_path = os.path.join(directory, filename)
        # Explicit UTF-8: pages contain CJK text/emoji that the platform
        # default encoding may not be able to represent.
        with open(file_path, 'w', encoding='utf-8') as f:
            f.write(content)
        return file_path
-
def main():
    """Script entry point: build a Detection instance.

    The sample calls below are kept as comments for manual experimentation.
    """
    detector = Detection()
    # logger.info(detector.analyze_link("https://live.douyin.com/324749687667"))
    # logger.info(detector.analyze_link("https://www.douyin.com/user/MS4wLjABAAAAtMD22LJGHKwwo1V1WR3Rh5KOUs1C4Jk0Fl5EWzPzuOhbCbC2yUHK9vuPu7nZ_rm4"))
    # logger.info(detector.analyze_live_url("https://live.douyin.com/3651787257", check=True))
    # logger.info(detector.analyze_live_url("https://v.douyin.com/3651787257", check=True))


# Run only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
|