|
|
@@ -0,0 +1,222 @@
|
|
|
+import datetime
|
|
|
+import json
|
|
|
+import os
|
|
|
+import sys
|
|
|
+import time
|
|
|
+import re
|
|
|
+import signal
|
|
|
+from DrissionPage import ChromiumPage
|
|
|
+from requests_html import HTML
|
|
|
+import requests_html
|
|
|
+from conf.settings import chome_options,OUTPUT,logger
|
|
|
+
|
|
|
+
|
|
|
class Detection():
    """Resolve Douyin share links, profile links and live-room links.

    Every lookup drives a single ``ChromiumPage`` (``self.page``), so the
    methods navigate that shared page as a side effect and are not
    independent of each other: a method that parses ``self.page.html``
    relies on the navigation performed just before it.
    """

    # Any douyin.com URL; group 1 is the full URL, group 2 the sub-domain
    # ("v", "www", "live", ...).  The URL ends at the first whitespace.
    _DOUYIN_URL_RE = re.compile(r'(https?://([\w]*)\.douyin\.com\S*)')
    # Canonical live-room URL: scheme + host + numeric room id, no query.
    _LIVE_URL_RE = re.compile(r'(https?://live\.douyin\.com/\d+)')
    # CSS selectors of the element holding the streamer's display name.
    # NOTE(review): the homepage class name looks build-generated and may
    # change when the site is redeployed -- confirm periodically.
    _HOMEPAGE_NAME_SELECTOR = 'span[class="Nu66P_ba"]'
    _LIVE_NAME_SELECTOR = 'div[data-e2e="live-room-nickname"]'

    def __init__(self) -> None:
        """Create the browser page, configured by ``chome_options``."""
        self.page = ChromiumPage(chome_options)

    def analyze_link(self, string):
        """Extract the first douyin.com URL from ``string`` and resolve it.

        Accepted inputs (free text around the URL is allowed):
          - profile share text ending in ``https://v.douyin.com/<id>/``
          - live share text containing ``https://v.douyin.com/<id>/``
          - profile URL ``https://www.douyin.com/user/<sec_uid>``
          - live-room URL ``https://live.douyin.com/<room_id>?camera_id=0``

        Returns:
          - the result of ``analyze_short_url`` for the ``v`` sub-domain,
          - the result of ``analyze_homepage_url`` for ``www``,
          - the result of ``analyze_live_url(url, check=True)`` for ``live``,
          - ``{}`` for any other sub-domain,
          - ``None`` when no douyin.com URL is found in ``string``.
        """
        res = self._DOUYIN_URL_RE.search(string)
        logger.debug(f'analyze_url:{string} re:{res if not res else res.groups()}')

        if not res:
            logger.info(f"格式错误 {string} ,支持以下链接形式:\n""""
            - 分享主页: 长按复制此条消息,打开抖音搜索,查看TA的更多作品。 https://v.douyin.com/iRMFSx59/
            - 直播分享: 3- #在抖音,记录美好生活#【麦穗儿🦋129】正在直播,来和我一起支持Ta吧。复制下方链接,打开【抖音】,直接观看直播! https://v.douyin.com/iRMFCkqF/ 1@5.com 12/29
            - 用户主页 https://www.douyin.com/user/MS4wLjABAAAAml99q0O4A4rk9SCLeVJXaWGi5e2pKxvQ1Oe0CjfTSHg
            - 直播间地址: https://live.douyin.com/363178125769?camera_id=0
            """)
            return

        url, subdomain = res.groups()
        ret = {}
        if subdomain == 'v':
            # Short share link, e.g. https://v.douyin.com/iRMFSx59/
            ret = self.analyze_short_url(url)
        elif subdomain == 'www':
            # Profile page, e.g. https://www.douyin.com/user/MS4wLjA...
            ret = self.analyze_homepage_url(url)
        elif subdomain == 'live':
            # Live room, with or without a query string.
            ret = self.analyze_live_url(url, check=True)
        logger.debug(f"ret:{ret}")
        return ret

    def analyze_short_url(self, share_url):
        """Open a ``v.douyin.com`` short link and classify where it lands.

        Returns:
          - ``{"homepage": <final url>, "name": <str>}`` when the final URL
            contains ``user``,
          - ``{"live_url": <final url>, "name": <str>}`` when it contains
            ``live``,
          - ``None`` otherwise.

        The final URL is returned as the browser reports it (any query
        string is kept).
        """
        logger.debug(f'share_url: {share_url}')
        self.page.get(share_url)
        self.page.wait.load_complete()
        # Give the redirect away from the short link time to finish.
        time.sleep(1)

        landed_url = self.page.url
        if 'user' in landed_url:
            return {"homepage": landed_url,
                    "name": self.get_user_name(self._HOMEPAGE_NAME_SELECTOR)}
        elif 'live' in landed_url:
            return {"live_url": landed_url,
                    "name": self.get_user_name(self._LIVE_NAME_SELECTOR)}
        return None

    def get_user_name(self, selector):
        """Return the text of the first element matching ``selector``.

        The lookup runs against the HTML of the page currently loaded in
        ``self.page``; an empty string is returned when nothing matches.
        """
        html = HTML(html=self.page.html)
        name_ele = html.find(selector, first=True)
        if name_ele:
            return name_ele.text
        return ''

    def analyze_live_url(self, url, check=False):
        """Normalise a live-room URL to ``https://live.douyin.com/<room_id>``.

        Args:
            url: a live-room address, optionally followed by a query
                string (for example the ``href`` of the live badge on a
                profile page).
            check: when true, the room is opened in the browser to verify
                it exists, and the streamer name is read from that page.

        Returns:
          - ``{"live_url": <normalised url>}`` when ``check`` is false,
          - ``{"live_url": <normalised url>, "name": <str>}`` when ``check``
            is true and the room exists,
          - ``None`` when ``url`` is not a live-room address, or when
            ``check`` is true and the room cannot be found.
        """
        search_res = self._LIVE_URL_RE.search(url)
        if not search_res:
            # Log the input itself: no normalised URL exists on this path.
            logger.warning(f"live link error:{url}")
            return None

        live_url = search_res.group(0)
        ret = {"live_url": live_url}
        if check:
            if not self.check_live_exist(live_url):
                logger.warning(f"live room not exist:{live_url}")
                return None
            # The live page is loaded only on this path, so the nickname
            # element of the live page can be read here.
            ret["name"] = self.get_user_name(self._LIVE_NAME_SELECTOR)
        return ret

    def analyze_homepage_url(self, url):
        """Open a profile page and report the user's name and live room.

        Returns a dict with keys ``name``, ``homepage`` (the ``url``
        argument) and ``live_url``; ``live_url`` is ``""`` when no live
        room link could be resolved from the page.
        """
        logger.debug(f'{url}')
        self.page.get(url)
        self.page.wait.load_complete()
        ret = {"name": self.get_user_name(self._HOMEPAGE_NAME_SELECTOR),
               "homepage": url}
        live_url = self.get_user_homepage_live_link(self.page.html)
        if live_url:
            ret.update(live_url)
        else:
            ret.update({"live_url": ""})
        logger.debug(f'ret {ret}')
        return ret

    def get_user_homepage_live_link(self, html):
        """Find the live-room link inside a profile page's HTML.

        Args:
            html: the profile page markup as a string.

        Returns:
          - ``{"live_url": <normalised url>}`` when an anchor is found in
            the user-detail container and its ``href`` is a live-room URL,
          - ``{}`` when the container has no anchor (user is not live),
          - ``None`` when the container is missing (the markup is saved
            via ``save_html`` for inspection) or the anchor's ``href`` is
            not a live-room URL.
        """
        logger.debug(f"{html[:10]}...")
        # Keep the raw markup under its own name: save_html writes text,
        # not a parsed HTML object.
        parsed = HTML(html=html)
        user = parsed.find('div[data-e2e="user-detail"]', first=True)
        if not user:
            path = self.save_html(html)
            logger.warning(f"Can not find <div[data-e2e=\"user-detail\"]> in user home page.Html save to:{path}")
            return None

        anchor = user.find('a', first=True)
        if not anchor:
            logger.info("The user is not live streaming")
            return {}

        # An anchor inside the user-detail container is taken to mean the
        # user is live.  An anchor without href degrades to "" so that
        # analyze_live_url reports it as an invalid link.
        href = anchor.attrs.get('href', '')
        logger.debug(href)
        # check is left off, so the browser stays on the profile page.
        return self.analyze_live_url(href)

    def check_live_exist(self, url):
        """Open ``url`` and report whether the live-room layout is present.

        Returns ``1`` when the expected container element is found on the
        page, ``0`` otherwise.
        """
        self.page.get(url)
        element = self.page.ele("xpath://div[@class='LV2pOyWA __leftContainer']")
        return 1 if element else 0

    def check_live_url(self, url):
        """Open a live-room URL and classify its state.

        Returns:
          - ``0`` the player area contains the text "结束" (stream ended),
          - ``1`` the player area is present without that text,
          - ``2`` no ``div.basicPlayer`` element is found on the page.

        NOTE(review): an earlier description also listed ``3`` (room is
        watchable on mobile only); that case is not detected here.
        """
        self.page.get(url)
        self.page.wait.load_complete()
        html = HTML(html=self.page.html)
        basic_ele = html.find('div.basicPlayer', first=True)
        if not basic_ele:
            return 2
        if '结束' in basic_ele.text:
            return 0
        return 1

    def save_html(self, content, max_files=30):
        """Write ``content`` to a timestamped ``.html`` file under ``OUTPUT``.

        Args:
            content: the markup to store, as a string.
            max_files: once the directory holds this many files, the
                oldest ones (by modification time) are removed first.

        Returns:
            The path of the file that was written.
        """
        directory = OUTPUT
        # makedirs also creates missing parents and tolerates an existing
        # directory.
        os.makedirs(directory, exist_ok=True)

        # Rotate: only regular files are candidates for removal.
        files = [
            name for name in os.listdir(directory)
            if os.path.isfile(os.path.join(directory, name))
        ]
        if len(files) >= max_files:
            files.sort(key=lambda x: os.path.getmtime(os.path.join(directory, x)))
            for stale in files[:len(files) - max_files + 1]:
                os.remove(os.path.join(directory, stale))

        # Millisecond-resolution timestamp keeps file names unique in practice.
        filename = datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S_%f")[:-3] + '.html'
        file_path = os.path.join(directory, filename)
        # Explicit UTF-8: page markup contains CJK text and emoji, which a
        # non-UTF-8 platform default encoding cannot represent.
        with open(file_path, 'w', encoding='utf-8') as f:
            f.write(content)
        return file_path
|
|
|
+
|
|
|
def main():
    """Manual entry point: start a ``Detection`` session.

    The commented-out calls are sample invocations kept for ad-hoc
    checks; enable one to exercise the corresponding lookup.
    """
    detector = Detection()
    # logger.info(detector.analyze_link("https://live.douyin.com/324749687667"))
    # logger.info(detector.analyze_link("https://www.douyin.com/user/MS4wLjABAAAAtMD22LJGHKwwo1V1WR3Rh5KOUs1C4Jk0Fl5EWzPzuOhbCbC2yUHK9vuPu7nZ_rm4"))
    # logger.info(detector.analyze_live_url("https://live.douyin.com/3651787257", check=True))
    # logger.info(detector.analyze_live_url("https://v.douyin.com/3651787257", check=True))


# Run only when executed as a script, not on import.
if __name__ == "__main__":
    main()
|