脚本之家,脚本语言编程技术及教程分享平台!
分类导航

Python|VBS|Ruby|Lua|perl|VBA|Golang|PowerShell|Erlang|autoit|Dos|bat|

服务器之家 - 脚本之家 - Python - python 爬取影视网站下载链接

python 爬取影视网站下载链接

2021-11-16 12:04 GriffinLewis2001 Python

一个简单的爬取影视网站下载链接的爬虫,非常适合新手学习,感兴趣的朋友可以参考下

 

项目地址:

https://github.com/GriffinLewis2001/Python_movie_links_scraper

 

运行效果

python 爬取影视网站下载链接

python 爬取影视网站下载链接

 

导入模块

?
1
2
3
4
5
6
import requests,re
from requests.cookies import RequestsCookieJar
from fake_useragent import UserAgent
import os,pickle,threading,time
import concurrent.futures
from goto import with_goto

 

爬虫主代码

  1. def get_content_url_name(url): 
  2.     send_headers = { 
  3.      "User-Agent":"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36"
  4.     "Connection""keep-alive"
  5.     "Accept""text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8"
  6.     "Accept-Language""zh-CN,zh;q=0.8" 
  7.  
  8.         } 
  9.     cookie_jar = RequestsCookieJar() 
  10.     cookie_jar.set("mttp""9740fe449238", domain="www.yikedy.co"
  11.     response=requests.get(url,send_headers,cookies=cookie_jar) 
  12.     response.encoding='utf-8' 
  13.     content=response.text 
  14.     reg=re.compile(r'<a href="(.*?)" rel="external nofollow"  rel="external nofollow"  rel="external nofollow"  rel="external nofollow"  rel="external nofollow"  rel="external nofollow"  class="thumbnail-img" title="(.*?)"'
  15.     url_name_list=reg.findall(content) 
  16.     return url_name_list 
  17.  
  18. def get_content(url): 
  19.     send_headers = { 
  20.      "User-Agent":"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36"
  21.     "Connection""keep-alive"
  22.     "Accept""text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8"
  23.     "Accept-Language""zh-CN,zh;q=0.8" 
  24.  
  25.         } 
  26.     cookie_jar = RequestsCookieJar() 
  27.     cookie_jar.set("mttp""9740fe449238", domain="www.yikedy.co"
  28.     response=requests.get(url,send_headers,cookies=cookie_jar) 
  29.     response.encoding='utf-8' 
  30.     return response.text 
  31.  
  32.  
  33.  
  34. def search_durl(url): 
  35.     content=get_content(url) 
  36.     reg=re.compile(r"{'decriptParam':'(.*?)'}"
  37.     index=reg.findall(content)[0] 
  38.     download_url=url[:-5]+r'/downloadList?decriptParam='+index 
  39.     content=get_content(download_url) 
  40.     reg1=re.compile(r'title=".*?" href="(.*?)" rel="external nofollow"  rel="external nofollow"  rel="external nofollow"  rel="external nofollow"  rel="external nofollow"  rel="external nofollow" '
  41.     download_list=reg1.findall(content) 
  42.     return download_list 
  43. def get_page(url): 
  44.     send_headers = { 
  45.      "User-Agent":"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36"
  46.     "Connection""keep-alive"
  47.     "Accept""text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8"
  48.     "Accept-Language""zh-CN,zh;q=0.8" 
  49.  
  50.         } 
  51.     cookie_jar = RequestsCookieJar() 
  52.     cookie_jar.set("mttp""9740fe449238", domain="www.yikedy.co"
  53.     response=requests.get(url,send_headers,cookies=cookie_jar) 
  54.     response.encoding='utf-8' 
  55.     content=response.text 
  56.     reg=re.compile(r'<a target="_blank" class="title" href="(.*?)" rel="external nofollow"  rel="external nofollow"  rel="external nofollow"  rel="external nofollow"  rel="external nofollow"  rel="external nofollow"  title="(.*?)">(.*?)</a>'
  57.     url_name_list=reg.findall(content) 
  58.     return url_name_list 
  59. @with_goto 
  60. def main(): 
  61.  
  62.     print("========================================================="
  63.     name=input("请输入剧名(输入quit退出):"
  64.     if name == "quit"
  65.         exit() 
  66.     url="http://www.yikedy.co/search?query="+name 
  67.     dlist=get_page(url) 
  68.     print(" "
  69.     if(dlist): 
  70.         num=0 
  71.         count=0 
  72.         for i in dlist: 
  73.             if (name in i[1]) : 
  74.                 print(f"{num} {i[1]}"
  75.                 num+=1 
  76.             elif num==0 and count==len(dlist)-1: 
  77.                 goto .end 
  78.             count+=1 
  79.         dest=int(input(" 请输入剧的编号(输100跳过此次搜寻):")) 
  80.         if dest == 100: 
  81.             goto .end 
  82.         x=0 
  83.         print(" 以下为下载链接: "
  84.         for i in dlist: 
  85.             if (name in i[1]): 
  86.                 if(x==dest): 
  87.                     for durl in search_durl(i[0]): 
  88.                         print(f"{durl} "
  89.  
  90.                     print(" "
  91.  
  92.                     break 
  93.                 x+=1 
  94.  
  95.     else
  96.         label .end 
  97.         print("没找到或不想看 "

完整代码

  1. import requests,re 
  2. from requests.cookies import RequestsCookieJar 
  3. from fake_useragent import UserAgent 
  4. import os,pickle,threading,time 
  5. import concurrent.futures 
  6. from goto import with_goto 
  7.  
  8. def get_content_url_name(url): 
  9.     send_headers = { 
  10.      "User-Agent":"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36"
  11.     "Connection""keep-alive"
  12.     "Accept""text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8"
  13.     "Accept-Language""zh-CN,zh;q=0.8" 
  14.  
  15.         } 
  16.     cookie_jar = RequestsCookieJar() 
  17.     cookie_jar.set("mttp""9740fe449238", domain="www.yikedy.co"
  18.     response=requests.get(url,send_headers,cookies=cookie_jar) 
  19.     response.encoding='utf-8' 
  20.     content=response.text 
  21.     reg=re.compile(r'<a href="(.*?)" rel="external nofollow"  rel="external nofollow"  rel="external nofollow"  rel="external nofollow"  rel="external nofollow"  rel="external nofollow"  class="thumbnail-img" title="(.*?)"'
  22.     url_name_list=reg.findall(content) 
  23.     return url_name_list 
  24.  
  25. def get_content(url): 
  26.     send_headers = { 
  27.      "User-Agent":"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36"
  28.     "Connection""keep-alive"
  29.     "Accept""text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8"
  30.     "Accept-Language""zh-CN,zh;q=0.8" 
  31.  
  32.         } 
  33.     cookie_jar = RequestsCookieJar() 
  34.     cookie_jar.set("mttp""9740fe449238", domain="www.yikedy.co"
  35.     response=requests.get(url,send_headers,cookies=cookie_jar) 
  36.     response.encoding='utf-8' 
  37.     return response.text 
  38.  
  39.  
  40.  
  41. def search_durl(url): 
  42.     content=get_content(url) 
  43.     reg=re.compile(r"{'decriptParam':'(.*?)'}"
  44.     index=reg.findall(content)[0] 
  45.     download_url=url[:-5]+r'/downloadList?decriptParam='+index 
  46.     content=get_content(download_url) 
  47.     reg1=re.compile(r'title=".*?" href="(.*?)" rel="external nofollow"  rel="external nofollow"  rel="external nofollow"  rel="external nofollow"  rel="external nofollow"  rel="external nofollow" '
  48.     download_list=reg1.findall(content) 
  49.     return download_list 
  50. def get_page(url): 
  51.     send_headers = { 
  52.      "User-Agent":"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36"
  53.     "Connection""keep-alive"
  54.     "Accept""text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8"
  55.     "Accept-Language""zh-CN,zh;q=0.8" 
  56.  
  57.         } 
  58.     cookie_jar = RequestsCookieJar() 
  59.     cookie_jar.set("mttp""9740fe449238", domain="www.yikedy.co"
  60.     response=requests.get(url,send_headers,cookies=cookie_jar) 
  61.     response.encoding='utf-8' 
  62.     content=response.text 
  63.     reg=re.compile(r'<a target="_blank" class="title" href="(.*?)" rel="external nofollow"  rel="external nofollow"  rel="external nofollow"  rel="external nofollow"  rel="external nofollow"  rel="external nofollow"  title="(.*?)">(.*?)</a>'
  64.     url_name_list=reg.findall(content) 
  65.     return url_name_list 
  66. @with_goto 
  67. def main(): 
  68.  
  69.     print("========================================================="
  70.     name=input("请输入剧名(输入quit退出):"
  71.     if name == "quit"
  72.         exit() 
  73.     url="http://www.yikedy.co/search?query="+name 
  74.     dlist=get_page(url) 
  75.     print(" "
  76.     if(dlist): 
  77.         num=0 
  78.         count=0 
  79.         for i in dlist: 
  80.             if (name in i[1]) : 
  81.                 print(f"{num} {i[1]}"
  82.                 num+=1 
  83.             elif num==0 and count==len(dlist)-1: 
  84.                 goto .end 
  85.             count+=1 
  86.         dest=int(input(" 请输入剧的编号(输100跳过此次搜寻):")) 
  87.         if dest == 100: 
  88.             goto .end 
  89.         x=0 
  90.         print(" 以下为下载链接: "
  91.         for i in dlist: 
  92.             if (name in i[1]): 
  93.                 if(x==dest): 
  94.                     for durl in search_durl(i[0]): 
  95.                         print(f"{durl} "
  96.  
  97.                     print(" "
  98.  
  99.                     break 
  100.                 x+=1 
  101.  
  102.     else
  103.         label .end 
  104.         print("没找到或不想看 "
  105.  
  106. print("本软件由CLY.所有 "
  107. while(True): 
  108.     main() 

以上就是python 爬取影视网站下载链接的详细内容,更多关于python 爬取下载链接的资料请关注服务器之家其它相关文章!

原文链接:https://github.com/GriffinLewis2001/Python_movie_links_scraper

延伸 · 阅读

精彩推荐