# detection.py
  1. import datetime
  2. import json
  3. import os
  4. import sys
  5. import time
  6. import re
  7. import signal
  8. from DrissionPage import ChromiumPage
  9. from requests_html import HTML
  10. import requests_html
  11. from conf.settings import chome_options,OUTPUT,logger
  12. class Detection():
  13. def __init__(self) -> None:
  14. self.page = ChromiumPage(chome_options)
  15. # signal.signal(signal.SIGINT, self.handle_interrupt)
  16. # def __del__(self):
  17. # self.page.close_tabs()
  18. # def handle_interrupt(self, signum, frame):
  19. # logger.info("Received interrupt signal. Cleaning up...")
  20. # self.page.close_tabs()
  21. # sys.exit(0)
  22. '''
  23. string
  24. - 任何抖音链接、分享链接
  25. - 分享主页: 长按复制此条消息,打开抖音搜索,查看TA的更多作品。 https://v.douyin.com/iRMFSx59/
  26. - 直播分享: 3- #在抖音,记录美好生活#【麦穗儿🦋129】正在直播,来和我一起支持Ta吧。复制下方链接,打开【抖音】,直接观看直播! https://v.douyin.com/iRMFCkqF/ 1@5.com 12/29
  27. - 用户主页 https://www.douyin.com/user/MS4wLjABAAAAml99q0O4A4rk9SCLeVJXaWGi5e2pKxvQ1Oe0CjfTSHg
  28. - 直播间地址: https://live.douyin.com/363178125769?camera_id=0
  29. return
  30. - {"homepage": "https://www.douyin.com/user/MS4wLjABAAAAml99q0O4A4rk9SCLeVJXaWGi5e2pKxvQ1Oe0CjfTSHg"}
  31. - {"live_url": "https://live.douyin.com/363178125769"}
  32. - {}
  33. '''
  34. def analyze_link(self, string):
  35. res = re.search(r'(https?://([\w]*)\.douyin.com[^ ]*)', string)
  36. logger.debug(f'analyze_url:{string} re:{res if not res else res.groups()}')
  37. if not res:
  38. logger.info(f"格式错误 {string} ,支持以下链接形式:\n""""
  39. - 分享主页: 长按复制此条消息,打开抖音搜索,查看TA的更多作品。 https://v.douyin.com/iRMFSx59/
  40. - 直播分享: 3- #在抖音,记录美好生活#【麦穗儿🦋129】正在直播,来和我一起支持Ta吧。复制下方链接,打开【抖音】,直接观看直播! https://v.douyin.com/iRMFCkqF/ 1@5.com 12/29
  41. - 用户主页 https://www.douyin.com/user/MS4wLjABAAAAml99q0O4A4rk9SCLeVJXaWGi5e2pKxvQ1Oe0CjfTSHg
  42. - 直播间地址: https://live.douyin.com/363178125769?camera_id=0
  43. """)
  44. return
  45. url, subdomain = res.groups()
  46. ret = {}
  47. # 分享链接,短链接 https://v.douyin.com/iRMFSx59/
  48. if subdomain == 'v':
  49. ret = self.analyze_short_url(url)
  50. # 用户主页: https://www.douyin.com/user/MS4wLjA...
  51. elif subdomain == 'www':
  52. ret = self.analyze_homepage_url(url)
  53. # res.groups()[0] = https://live.douyin.com/363178125769?camera_id=0 或 https://live.douyin.com/192990287232
  54. elif subdomain == 'live':
  55. ret = self.analyze_live_url(url, check=True)
  56. logger.debug(f"ret:{ret}")
  57. return ret
  58. '''
  59. share_url:
  60. - 分享地址: 长按复制此条消息,打开抖音搜索,查看TA的更多作品。 https://v.douyin.com/iRMFSx59/
  61. - 直播分享: 3- #在抖音,记录美好生活#【麦穗儿🦋129】正在直播,来和我一起支持Ta吧。复制下方链接,打开【抖音】,直接观看直播! https://v.douyin.com/iRMFCkqF/ 1@5.com 12/29
  62. return:
  63. - {"homepage": "https://www.douyin.com/user/MS4wLjABAAAAml99q0O4A4rk9SCLeVJXaWGi5e2pKxvQ1Oe0CjfTSHg"}
  64. - {"live_url": "https://live.douyin.com/363178125769"}
  65. - None
  66. '''
  67. def analyze_short_url(self, share_url):
  68. logger.debug(f'share_url: {share_url}')
  69. self.page.get(share_url)
  70. self.page.wait.load_complete()
  71. # 等待重定向
  72. time.sleep(1)
  73. if 'user' in self.page.url:
  74. return {"homepage": self.page.url, "name":self.get_user_name('span[class="Nu66P_ba"]')}
  75. elif 'live' in self.page.url:
  76. return {"live_url": self.page.url, "name":self.get_user_name('div[data-e2e="live-room-nickname"]')}
  77. def get_user_name(self,selector):
  78. html = HTML(html=self.page.html)
  79. name_ele = html.find(selector, first=True)
  80. if name_ele:
  81. return name_ele.text
  82. return ''
  83. '''
  84. url:
  85. - 直播间地址: https://live.douyin.com/363178125769?camera_id=0
  86. - a 标签地址: https://live.douyin.com/21289753259?enter_from_merge=web_others_homepage&ent...
  87. return:
  88. - if check: {"live_url": "https://live.douyin.com/363178125769"} (正在直播)
  89. - None
  90. - 直播间 url 格式正确,但是没有这个直播间,网页提示:该内容无法查看
  91. - url 正则表达式地址错误
  92. '''
  93. def analyze_live_url(self, url, check=False):
  94. search_res = re.search(r'(https?://live.douyin.com/[\d]+)', url)
  95. if search_res:
  96. live_url = search_res.group(0)
  97. live_page_user_name = {}
  98. if check:
  99. exist = self.check_live_exist(live_url)
  100. if not exist:
  101. logger.warning(f"live room not exist:{live_url}")
  102. return
  103. # 只有在check的时候会访问 live URL ,此时可以通过 data-e2e="live-room-nickname" 找到页面元素
  104. live_page_user_name = {"name":self.get_user_name('div[data-e2e="live-room-nickname"]')}
  105. ret = {"live_url": live_url}
  106. ret.update(live_page_user_name)
  107. return ret
  108. else:
  109. logger.warning(f"live link error:{live_url}")
  110. '''
  111. url
  112. - 用户主页 https://www.douyin.com/user/MS4wLjABAAAAml99q0O4A4rk9SCLeVJXaWGi5e2pKxvQ1Oe0CjfTSHg
  113. return
  114. - 正在直播 {"live_url": "https://live.douyin.com/363178125769", "name":"xxx", "homepage":"https://www.douyin.com/user/MS4wLjABAAAAml99q0O4A4rk9SCLeVJXaWGi5e2pKxvQ1Oe0CjfTSHg"}
  115. - 没有直播 {"live_url": "", "name":"xxx", "homepage":"https://www.douyin.com/user/MS4wLjABAAAAml99q0O4A4rk9SCLeVJXaWGi5e2pKxvQ1Oe0CjfTSHg"}
  116. '''
  117. def analyze_homepage_url(self, url):
  118. logger.debug(f'{url}')
  119. self.page.get(url)
  120. self.page.wait.load_complete()
  121. ret = {"name":self.get_user_name('span[class="Nu66P_ba"]'), "homepage":url}
  122. live_url = self.get_user_homepage_live_link(self.page.html)
  123. if live_url:
  124. ret.update(live_url)
  125. else:
  126. ret.update({"live_url": ""})
  127. logger.debug(f'ret {ret}')
  128. return ret
  129. '''
  130. html:
  131. - 用户主页 html(https://www.douyin.com/user/MS4wLjABAAAAml99q0O4A4rk9SCLeVJXaWGi5e2pKxvQ1Oe0CjfTSHg)
  132. return:
  133. - {'live_url': 'https://live.douyin.com/324749687667', 'name': 'xxx'} (正在直播)
  134. - None (没有直播)
  135. '''
  136. def get_user_homepage_live_link(self, html):
  137. logger.debug(f"{html[:10]}...")
  138. html = HTML(html=html)
  139. user = html.find('div[data-e2e="user-detail"]', first=True)
  140. if not user:
  141. path = self.save_html(html)
  142. logger.warning(f"Can not find <div[data-e2e=\"user-detail\"]> in user home page.Html save to:{path}")
  143. return
  144. a = user.find('a', first=True)
  145. # 如果找到 a 标签,说明正在直播
  146. if a:
  147. logger.debug(a.attrs['href'])
  148. # 因为 analyze_live_url 没有使用 check 参数,不会主动去访问 live URL, 此时还在 homepage 页面,需要查找 homepage 中的用户名元素: span[class="Nu66P_ba"]
  149. return self.analyze_live_url(a.attrs['href'])
  150. else:
  151. logger.info("The user is not live streaming")
  152. return {}
  153. '''是否存在这个直播间地址
  154. url:
  155. - https://live.douyin.com/363178125769
  156. return:
  157. - 1 | 0
  158. '''
  159. def check_live_exist(self, url):
  160. self.page.get(url)
  161. element = self.page.ele("xpath://div[@class='LV2pOyWA __leftContainer']")
  162. if element:
  163. return 1
  164. '''
  165. url:
  166. - https://live.douyin.com/363178125769
  167. return:
  168. - 0 直播已结束
  169. - 1 正常
  170. - 2 链接不存在,页面提示“该内容暂时无法查看” (链接出错)
  171. - 3 直播链接仅限手机观看(不影响录制)
  172. '''
  173. def check_live_url(self, url):
  174. self.page.get(url)
  175. self.page.wait.load_complete()
  176. html = HTML(html=self.page.html)
  177. basic_ele = html.find('div.basicPlayer', first=True)
  178. if not basic_ele:
  179. return 2
  180. if '结束' in basic_ele.text:
  181. return 0
  182. def save_html(self, content):
  183. directory = OUTPUT
  184. if not os.path.exists(directory):
  185. os.mkdir(directory)
  186. # 获取目录中的文件
  187. files = os.listdir(directory)
  188. # 如果文件数量超过最大值,则删除最旧的文件
  189. if len(files) >= 30:
  190. files.sort(key=lambda x: os.path.getmtime(os.path.join(directory, x)))
  191. os.remove(os.path.join(directory, files[0]))
  192. # 保存新的文件
  193. filename = datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S_%f")[:-3] + '.html'
  194. file_path = os.path.join(OUTPUT, filename)
  195. with open((file_path), 'w') as f:
  196. f.write(content)
  197. return file_path
  198. def main():
  199. d = Detection()
  200. # logger.info(d.analyze_link("https://live.douyin.com/324749687667"))
  201. # logger.info(d.analyze_link("https://www.douyin.com/user/MS4wLjABAAAAtMD22LJGHKwwo1V1WR3Rh5KOUs1C4Jk0Fl5EWzPzuOhbCbC2yUHK9vuPu7nZ_rm4"))
  202. # logger.info(d.analyze_live_url("https://live.douyin.com/3651787257", check=True))
  203. # logger.info(d.analyze_live_url("https://v.douyin.com/3651787257", check=True))
  204. if __name__ == "__main__":
  205. main()