抓取豆瓣电影代码
By skyshappiness Posted 2017-01-04 21:46:09 In

#!/usr/bin/python

#encoding:utf-8

import MySQLdb

import urllib2

from bs4 import BeautifulSoup

import sys

# Python 2 only: re-import sys and then set the process-wide default string
# encoding to UTF-8. Presumably this is here so that the implicit
# str()/unicode conversions applied to scraped text further down do not fail
# on non-ASCII characters -- TODO confirm before porting to Python 3, where
# neither call exists.
reload(sys)

sys.setdefaultencoding("utf-8")

# Fetch a single row.
def findData(sql, params=None):
    """Run a query against the local ``blog`` database and return one row.

    A new connection is opened for every call and is always closed again,
    even when executing the statement raises.

    Args:
        sql: SQL statement to execute. When ``params`` is given it may
            contain ``%s`` placeholders (and literal ``%`` must then be
            written as ``%%``).
        params: Optional values bound to the placeholders by the driver.
            When omitted, ``sql`` is executed verbatim, exactly as before.

    Returns:
        Whatever ``cursor.fetchone()`` yields: the first result row, or
        ``None`` when the query produced no rows.
    """
    db = MySQLdb.connect(charset="utf8", host="localhost", user="root", passwd="", db="blog")
    try:
        cursor = db.cursor()
        try:
            if params is None:
                cursor.execute(sql)
            else:
                cursor.execute(sql, params)
            return cursor.fetchone()
        finally:
            cursor.close()
    finally:
        # Close unconditionally so a failing query does not leak the connection.
        db.close()

# Insert / update a single row.
def insertData(sql, params=None):
    """Execute a write statement against the local ``blog`` database and commit.

    A new connection is opened for every call and is always closed again.
    If executing the statement raises, nothing is committed and the
    exception propagates to the caller.

    Args:
        sql: INSERT/UPDATE statement to execute. When ``params`` is given it
            may contain ``%s`` placeholders (and literal ``%`` must then be
            written as ``%%``).
        params: Optional values bound to the placeholders by the driver.
            When omitted, ``sql`` is executed verbatim, exactly as before.
    """
    db = MySQLdb.connect(charset="utf8", host="localhost", user="root", passwd="", db="blog")
    try:
        cursor = db.cursor()
        try:
            if params is None:
                cursor.execute(sql)
            else:
                cursor.execute(sql, params)
        finally:
            cursor.close()
        db.commit()
    finally:
        # Close unconditionally so a failing statement does not leak the connection.
        db.close()

# Fetch a page.
def grabContent(url):
    """Download ``url`` and return the raw response body.

    The request is sent with a desktop Chrome ``User-Agent`` header.

    Args:
        url: Address of the page to fetch.

    Returns:
        The response body as returned by ``read()`` on the opened response.

    Raises:
        Whatever ``urllib2.urlopen`` / ``read`` raise on network or HTTP
        errors; nothing is caught here.
    """
    header = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.106 Safari/537.36',
    }
    req = urllib2.Request(url, headers=header)
    con = urllib2.urlopen(req)
    try:
        return con.read()
    finally:
        # Release the socket even if reading the body fails part-way.
        con.close()

# Start everything from scratch.
def startFromZero():
    """Seed ``blog_movie_tag`` from the Douban tag index, then start crawling.

    Fetches the tag overview page, stores the text of every ``<a class="tag">``
    link that is not already present in ``blog_movie_tag``, and finally calls
    ``grabMovieInfo()``.
    """
    data = grabContent('http://movie.douban.com/tag/')
    soup = BeautifulSoup(data, 'html.parser')

    for link in soup.find_all('a', {'class': 'tag'}):
        # The tag text comes from a scraped page: escape it before splicing it
        # into SQL so quotes/backslashes cannot break or alter the statement.
        tagText = MySQLdb.escape_string(str(link.get_text()))

        # Same existence check grabMovieTag() performs, so re-running the
        # script does not insert the same tag twice.
        existId = findData("SELECT `id` FROM `blog_movie_tag` WHERE tag = '" + tagText + "'")
        if existId is None:
            insertData("INSERT INTO `blog_movie_tag` (tag) VALUES ('" + tagText + "')")

    grabMovieInfo()

# Start crawling movies.
def grabMovieInfo():
    """Crawl the movie listing pages of one not-yet-processed tag.

    Picks one row from ``blog_movie_tag`` whose ``status`` is ``'0'`` and walks
    35 listing pages for it. Every movie not yet present in ``blog_movie`` is
    inserted (name and description text) and its detail page is handed to
    ``grabMovieTag()``. The tag is marked ``status = '1'`` just before the
    last page is fetched. Returns without doing anything when no pending tag
    exists.
    """
    tag = findData("SELECT `tag` from blog_movie_tag WHERE status = '0' LIMIT 1 ")
    if tag is None:
        # No tag left to process; indexing None below would raise TypeError.
        return
    tagName = tag[0]

    for i in range(0, 35):
        if i != 34:
            startNum = i * 15
        else:
            # The tag value must be a quoted (and escaped) SQL string literal;
            # without the quotes the UPDATE is not valid SQL.
            updateTagSql = ("UPDATE `blog_movie_tag` SET status = '1' where tag = '"
                            + MySQLdb.escape_string(str(tagName)) + "'")
            insertData(updateTagSql)
            # NOTE(review): the final offset is 500 rather than 34 * 15 = 510;
            # kept as in the original -- confirm whether this is intentional.
            startNum = 500

        pageUrl = "http://www.douban.com/tag/" + tagName + "/movie?start=" + str(startNum)
        data = grabContent(pageUrl)
        soup = BeautifulSoup(data, 'html.parser')

        for item in soup.find_all('dl'):
            titleLink = item.find('a', {'class': 'title'})
            descDiv = item.find('div', {'class': 'desc'})
            if titleLink is None or descDiv is None:
                # Not a movie entry (or the markup changed); skip it instead
                # of crashing on None.
                continue

            # Single quotes are stripped from the stored name, as before.
            movieName = str(titleLink.get_text()).replace("'", "")
            # Escaped copies are used only inside SQL text; the scraped values
            # may still contain backslashes or (for the description) quotes.
            sqlName = MySQLdb.escape_string(movieName)
            sqlYear = MySQLdb.escape_string(str(descDiv.get_text()))

            oldDataSql = "SELECT `id` FROM `blog_movie` WHERE movie_name = '" + sqlName + "'"
            existId = findData(oldDataSql)
            if existId is None:
                movieSql = ("INSERT INTO `blog_movie` (movie_name, movie_year) VALUES ('"
                            + sqlName + "','" + sqlYear + "')")
                insertData(movieSql)
                grabMovieTag(titleLink.get('href'), movieName)

# Grab the tags from a movie's detail page.
def grabMovieTag(url, movieName):
    """Store a movie's rating and collect the tags shown on its detail page.

    Args:
        url: Address of the movie detail page.
        movieName: Name under which the movie is stored in ``blog_movie``;
            used to locate the row whose ``rate`` is updated.

    The text of ``<strong class="rating_num">`` is written to ``rate``. Up to
    eight links inside ``<div class="tags-body">`` are then inserted into
    ``blog_movie_tag`` unless a row with the same tag already exists. Missing
    elements are skipped rather than raising.
    """
    data = grabContent(url)
    soup = BeautifulSoup(data, 'html.parser')

    rateNode = soup.find('strong', {'class': 'rating_num'})
    if rateNode is not None:
        # Both values end up inside SQL string literals, so escape them.
        movieRate = MySQLdb.escape_string(str(rateNode.get_text()))
        sqlName = MySQLdb.escape_string(str(movieName))
        updateMovieSql = ("UPDATE `blog_movie` SET rate = '" + movieRate
                          + "' WHERE movie_name = '" + sqlName + "'")
        insertData(updateMovieSql)

    tagsBody = soup.find('div', {'class': 'tags-body'})
    if tagsBody is None:
        return

    # find_all() stays inside the tags container. The previous
    # find_all_next() kept walking the rest of the document, so a page with
    # fewer than eight tags had unrelated link texts stored as tags.
    for grabTag in tagsBody.find_all('a', limit=8):
        tagText = MySQLdb.escape_string(str(grabTag.get_text()))
        oldDataSql = "SELECT `id` FROM `blog_movie_tag` WHERE tag = '" + tagText + "'"
        existId = findData(oldDataSql)
        if existId is None:
            dataSql = "INSERT INTO `blog_movie_tag` (tag) VALUES ('" + tagText + "')"
            insertData(dataSql)

# Script entry point: this call sits at module level, so the full crawl
# (tag seeding followed by movie crawling) starts as soon as the file is
# executed -- and also if it is imported.
startFromZero()

友情链接
联系方式
  • 邮箱 / E-mail:skyshappiness@gmail.com