Python爬虫采集网站视频

python 2019年04月19日 17:48 Norsl 46 0

目标站点是 www.156zy.cc

爬下来的数据请勿用于商业用途!!!

一个url采集线程,10个解析线程(可更改)

数据库地址、用户名和密码请自行在脚本相应位置(db 字典)修改

脚本依赖第三方库,请提前装好(requests、pymysql、beautifulsoup4,即代码中导入的 bs4)

这个算得上是一个学习的demo吧,代码略丑,大牛勿笑

import threading
import requests
import re
import pymysql
import time
from bs4 import BeautifulSoup

# Target site as a bare hostname; the code that builds URLs prepends "http://".
host = "www.156zy.cc"

# Shared work queue of detail-page URLs (filled by the URL collector thread,
# drained by the worker threads).
urlList = []
# Not referenced by the rest of the visible script.
htmlList = []

# MySQL connection settings -- adjust to your own server before running.
db = {'host': '127.0.0.1', 'user': 'root', 'pass': 'root', 'name': 'video'}

# Browser-like request headers.
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/73.0.3683.75 Safari/537.36'
    ),
}

# True while the URL collector is still running; it sets this to False when
# it is done so the workers know no more URLs will arrive.
status = True

# URL collection thread
class GetUrlThread(threading.Thread):
    """Producer thread: walk the site's listing pages and queue detail URLs.

    Reads the total page count from the pager on the home page, then fetches
    every listing page and appends the absolute URL of each linked detail
    page to the shared ``urlList``. When it finishes -- normally or because
    of an error -- it sets the global ``status`` flag to ``False`` so the
    worker threads know that no further URLs will be queued.
    """

    def __init__(self):
        threading.Thread.__init__(self)

    def run(self):
        """Collect all detail-page URLs, then clear the ``status`` flag."""
        global status
        try:
            html = getHtml("http://" + host)
            bs = BeautifulSoup(html, 'html.parser')
            pageBox = bs.find('div', class_='pages')
            # The pager markup embeds the last page number in a script call.
            match = re.search(r'html\',(\d+)\)\"', str(pageBox))
            if match is None:
                print("未能解析总页数,url采集线程结束")
                return

            for pageNo in range(1, int(match.group(1)) + 1):
                pageUrl = "http://" + host + "/?m=vod-index-pg-" + str(pageNo) + ".html"
                try:
                    content = getHtml(pageUrl)
                except requests.RequestException:
                    # One unreachable listing page must not abort the crawl.
                    print(pageUrl + " 列表页下载失败,已跳过...")
                    continue
                pageBs = BeautifulSoup(content, 'html.parser')
                for item in pageBs.select(".xing_vb4 a"):
                    href = item.get('href')
                    if href:
                        urlList.append("http://" + host + href)
        finally:
            # Always signal completion; otherwise the workers, which wait
            # for this flag, would never terminate after a failure here.
            status = False



# Data collection thread
class GetInfoThread(threading.Thread):
    """Worker thread: download queued detail pages, parse them, store in MySQL.

    Each worker repeatedly takes a URL from the shared ``urlList``, fetches it
    with ``getHtml``, extracts the video metadata and its ``.m3u8`` episode
    links, and writes them to the ``video`` and ``block`` tables through the
    connection it was given. The worker stops once the global ``status`` flag
    has been cleared and the queue is empty.

    Args:
        conn: An open database connection whose cursor accepts ``%s``
            placeholders (``main`` passes a ``pymysql`` connection). The
            worker uses it exclusively and closes it when it exits.
    """

    # How many times a URL is put back on the queue after a failed download
    # before it is dropped (prevents retrying a dead URL forever).
    MAX_RETRIES = 3
    # Seconds to wait when the queue is momentarily empty (avoids busy-spinning).
    IDLE_SLEEP = 0.2
    # Download-failure counts per URL, shared by all workers; guarded by threadLock.
    _failures = {}

    _INSERT_VIDEO = (
        "insert into video("
        "title, cover, ctitle, info, tag, director, performer, "
        "type, area, lang, release_time"
        ") values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)"
    )

    def __init__(self, conn):
        threading.Thread.__init__(self)
        self.conn = conn
        self.mysql = conn.cursor()

    def run(self):
        """Process queued URLs until the collector is done and the queue is empty."""
        try:
            while True:
                # Read the flag BEFORE looking at the queue: if the collector
                # had already finished and the queue is empty afterwards,
                # nothing more can arrive from it.
                collector_done = not status
                url = self._next_url()
                if url is None:
                    if collector_done:
                        print("线程:" + self.name + '执行结束')
                        return
                    time.sleep(self.IDLE_SLEEP)
                    continue
                self._process(url)
        finally:
            self.mysql.close()
            self.conn.close()

    def _next_url(self):
        """Pop the next URL from the shared queue, or return None if it is empty."""
        # Check and pop under the same lock so another worker cannot empty
        # the list between the two steps.
        with threadLock:
            if urlList:
                return urlList.pop(0)
        return None

    def _requeue(self, url):
        """Put a URL whose download failed back on the queue, up to MAX_RETRIES times."""
        with threadLock:
            attempts = self._failures.get(url, 0) + 1
            give_up = attempts > self.MAX_RETRIES
            if give_up:
                self._failures.pop(url, None)
            else:
                self._failures[url] = attempts
                urlList.append(url)
        if give_up:
            print(url + " 下载失败,已放弃...")

    def _process(self, url):
        """Download, parse and store a single detail page."""
        try:
            html = getHtml(url)
        except Exception:
            # Best effort, as before: any download failure means "try again
            # later" (now bounded by MAX_RETRIES).
            self._requeue(url)
            return
        with threadLock:
            self._failures.pop(url, None)

        video = self._parse_detail(html)
        if video is None:
            return

        vid = self._save_video(video)
        if not vid:
            return

        self._save_episodes(vid, video['title'], video['episodes'])
        with threadLock:
            print("############################### " + str(len(urlList)) + "待采集 ###############################")

    @staticmethod
    def _parse_detail(html):
        """Extract the video fields and episode list from a detail page.

        Returns:
            A dict with the column values plus ``'episodes'`` (a list of
            ``"name$url"`` strings), or ``None`` when the page has no title,
            no ``.m3u8`` episodes, or lacks the expected markup.
        """
        bs = BeautifulSoup(html, 'html.parser')

        titles = bs.select('.vodh h2')
        if not titles:
            return None

        episodes = re.findall(r'/>(.*?\.m3u8)</li>', html)
        if not episodes:
            return None

        covers = bs.select('.vodImg img')
        cover = covers[0].get('src', '') if covers else ''

        try:
            # The first seven <li> entries are read in this fixed order.
            (cname, director, performer, vtype,
             area, lang, release_time) = [
                li.span.text for li in bs.select('.vodinfobox ul li')[:7]
            ]
            tag = bs.select('.vodh span')[0].text
            intro = bs.select('.vodplayinfo')[1].text
        except (IndexError, AttributeError, ValueError):
            # Unexpected page layout: skip the page instead of killing the thread.
            return None

        # Single quotes are still stripped from the same fields as before so
        # that title lookups keep matching rows stored by earlier runs.
        return {
            'title': titles[0].text.replace("'", ''),
            'cover': cover,
            'ctitle': cname.replace("'", ''),
            'info': intro.replace('#', ''),
            'tag': tag.replace("'", ''),
            'director': director.replace("'", ''),
            'performer': performer.replace("'", ''),
            'type': vtype,
            'area': area.replace("'", ''),
            'lang': lang,
            'release_time': release_time,
            'episodes': episodes,
        }

    def _save_video(self, video):
        """Return the id of the video row, inserting it if the title is new.

        Returns ``None`` (after printing a notice) if the database step fails.
        """
        try:
            # Parameterized queries: the driver escapes the scraped values.
            self.mysql.execute("select id from video where title=%s", (video['title'],))
            row = self.mysql.fetchone()
            if row:
                return row[0]
            self.mysql.execute(self._INSERT_VIDEO, (
                video['title'],
                video['cover'],
                video['ctitle'],
                video['info'],
                video['tag'],
                video['director'],
                video['performer'],
                video['type'],
                video['area'],
                video['lang'],
                video['release_time'],
            ))
            self.conn.commit()
            return self.mysql.lastrowid
        except Exception:
            # Deliberately broad: a bad record is skipped, the worker lives on.
            print("采集失败,已自动跳过...")
            return None

    def _save_episodes(self, vid, video_title, episodes):
        """Insert every episode of video ``vid`` that is not stored yet."""
        for item in episodes:
            # Entries look like "name$url"; skip anything without the separator.
            name, sep, link = item.partition('$')
            if not sep:
                continue
            try:
                self.mysql.execute("select id from block where url=%s and vid=%s", (link, vid))
                if self.mysql.fetchone() is None:
                    self.mysql.execute("insert into block(vid,name,url) values(%s,%s,%s)", (vid, name, link))
                    self.conn.commit()
                    print(video_title + " " + name + " 已采集...")
                else:
                    # NOTE(review): this rewrites url with the value it was
                    # matched on, so it changes nothing; kept from the
                    # original -- confirm what it was meant to update.
                    self.mysql.execute("update block set url=%s where url=%s and vid=%s", (link, link, vid))
                    self.conn.commit()
                    print(video_title + " " + name + " 已更新链接...")
            except Exception:
                # Deliberately broad: skip this episode, keep going.
                print(video_title + " " + name + " 采集失败,已自动跳过...")


###################################################
def getHtml(url):
    """Download ``url`` and return the response body as text.

    The request carries the module-level ``headers`` (a browser User-Agent),
    which were previously defined but never sent, and gives up after 30
    seconds.

    Args:
        url: Absolute URL to fetch.

    Returns:
        The decoded response body (``Response.text``).

    Raises:
        requests.RequestException: Propagated from ``requests.get`` on
            connection errors or timeouts.
    """
    return requests.get(url, headers=headers, timeout=30).text

# Thread lock shared by the worker threads; they hold it while popping from
# urlList and while printing the remaining-queue size.
threadLock = threading.Lock()

def main(worker_count=10):
    """Start the URL collector thread and the detail-page worker threads.

    Args:
        worker_count: Number of ``GetInfoThread`` workers to start; each one
            gets its own database connection. Defaults to 10.
    """
    getUrlThread = GetUrlThread()
    getUrlThread.start()

    # One connection per worker: a connection is not shared between threads.
    for _ in range(worker_count):
        # Keyword arguments: current PyMySQL releases no longer accept the
        # connection settings positionally. The charset is set explicitly so
        # the Chinese text is sent as UTF-8 whatever the driver's default is.
        conn = pymysql.connect(
            host=db['host'],
            user=db['user'],
            password=db['pass'],
            database=db['name'],
            charset='utf8mb4',
        )
        thread = GetInfoThread(conn)
        thread.start()


# Only start crawling when executed as a script, not when imported.
if __name__ == "__main__":
    main()


数据库结构

-- One row per scraped video. The scraper looks a video up by `title` and
-- inserts title, cover, ctitle, info, tag, director, performer, type, area,
-- lang and release_time when no row with that title exists.
-- NOTE(review): the `is` column is never written by the scraper, so it always
-- takes its default of 1; its meaning is not visible here -- confirm.
-- KEY `title` serves the scraper's lookup by title.
CREATE TABLE `video` (
  `id` int(11) NOT NULL AUTO_INCREMENT,
  `title` varchar(255) NOT NULL DEFAULT '',
  `cover` varchar(255) DEFAULT NULL,
  `ctitle` varchar(255) DEFAULT '',
  `info` mediumtext,
  `tag` varchar(255) DEFAULT NULL,
  `director` varchar(255) DEFAULT NULL,
  `performer` varchar(255) DEFAULT NULL,
  `type` varchar(255) DEFAULT NULL,
  `area` varchar(255) DEFAULT NULL,
  `lang` varchar(255) DEFAULT NULL,
  `release_time` varchar(255) DEFAULT NULL,
  `is` tinyint(1) NOT NULL DEFAULT '1',
  PRIMARY KEY (`id`),
  KEY `title` (`title`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8;

-- One row per episode (playable .m3u8 link) of a video. The scraper stores
-- the id of the owning `video` row in `vid`, the episode label in `name` and
-- the link in `url`, and checks for an existing row by (url, vid).
-- No foreign key is declared; `vid` is only indexed.
CREATE TABLE `block` (
  `id` int(11) NOT NULL AUTO_INCREMENT,
  `vid` int(11) NOT NULL DEFAULT '0',
  `name` varchar(255) DEFAULT NULL,
  `url` varchar(255) NOT NULL DEFAULT '',
  PRIMARY KEY (`id`),
  KEY `vid` (`vid`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8;


说点什么吗?

你的电子邮箱地址不会被公开。必填项已用 * 标注

*
*
验证码

Norsl の 个人名片

职业:web开发

现居:四川 成都

Git:https://gitee.com/norsl

邮箱:3136904131@qq.com

最新评论

  • Norsl : 当时都以为凉凉了,不然现在就是空库了
  • LILI : 有惊无险啊哈
  • 林三 : 原谅我每次过来都找不到地方说话,你哪些代码函数太深奥,我确实看不懂呢。
  • 热血学霸 : 感谢大佬分享,好人一生平安
  • 格瑞LILI : 你猜我是从哪看到你的博客的《-_-》
  • 格瑞LILI : 网站名称:格瑞LILI 网站地址:https://kaygb.top 网站描述:个人博客,分享学习生涯遇到的问题
  • zgcwkj : 给力啊!!!谢了
  • 妙文屋 : 写的很好,很喜欢
  • 马也随笔 : 这个记录评论者COOKIE功能挺实用的,有没有插件可以使用的,用的WP。
  • 林三 : 你厉害,我是追不上了,业务的太狠了。