频道栏目
首页 > 程序开发 > Web开发 > Python > 正文
python爬虫记录(一)
2017-08-26 09:34:00         来源:chen  
收藏   我要投稿

最近在学习 Python 以及利用 Python 编写爬虫,公司项目需要:

爬虫并入库:代码如下

过程中碰到最多的就是 Python 2 的编码问题,太烦人了。

#!/user/bin/python
# -*- coding: UTF-8 -*-

import urllib
import urllib2
import lxml
import re
import MySQLdb
import time
from bs4 import BeautifulSoup

import httplib
# Force plain HTTP/1.0 requests — the classic workaround for servers whose
# chunked HTTP/1.1 responses make httplib raise IncompleteRead.
# Bug fix: the original set the version number to 10 (HTTP/1.0) but still
# sent "HTTP/1.1" in the request line; the two must agree.
httplib.HTTPConnection._http_vsn = 10
httplib.HTTPConnection._http_vsn_str = 'HTTP/1.0'

# Impersonate a desktop Chrome browser; some sites reject urllib2's default
# User-Agent string.
user_agent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.109 Safari/537.36'  
hdr = { 'User-Agent' : user_agent }

# Module-level MySQL connection, shared by get_book()/__main__.
# NOTE(review): credentials are hard-coded in source — move to a config
# file or environment variables before real deployment.
db = MySQLdb.connect(host="localhost", port=3306, user="root", passwd="123456", db="xiaoshuo", charset="utf8")
# Accumulating multi-row INSERT statement: getBookInfoBaseOnUrl() appends one
# "(...)," value tuple per crawled book; get_book() strips the trailing comma
# and executes the whole statement at the end.
str_sql = '''INSERT INTO `xiaoshuo`.`book1` (`bookName`, `author`, `url`, `classifyName`, `brief`, `updateTime`,
            `status`) VALUES'''

def getBookInfoBaseOnUrl(url):

    global str_sql

    request = urllib2.Request(url, headers=hdr)
    response = urllib2.urlopen(request)
    html_data = response.read().decode('gbk')

    soup = BeautifulSoup(html_data,'lxml')
    mylist = soup.select('head')

    for item in mylist: 
        bookName = item.find(property="og:novel:book_name").get("content")
        print "书名:", bookName

        author = item.find(property="og:novel:author").get("content")
        print "作者:", author

        url = item.find(property="og:novel:read_url").get("content")
        print "链接:", url

        classifyName = item.find(property="og:novel:category").get("content")
        print "类型:", classifyName

        description = item.find(property="og:description").get("content")
        print "brief:", description

        updateTime = item.find(property="og:novel:update_time").get("content")
        print "更新时间:", updateTime

        status = item.find(property="og:novel:status").get("content")
        print "status:", status

        str_sql += '("' + bookName + '", "' + author + '", "' + url + '", "' + classifyName + '", "' + description + '", "' + updateTime + '", "' + status + '"),'
        
        print "-----------------------------------------------------------------------------------------"

def get_book( ):

    global str_sql
    cursor = db.cursor()
    #count = 0

    soup = BeautifulSoup(open('biquge.html'),'lxml')
    mylist = soup.find_all('p', class_ ='content')
    for item in mylist:
        #print item
        xiaoshuo_list = item.find_all('li')
        for item in xiaoshuo_list:
            #coutn2 = 0
            url = item.find('a').get('href')
            print "书的连接:" , url
            getBookInfoBaseOnUrl(url)

            #coutn2 = coutn2 + 1
            #if coutn2 == 2:
                #break
        #count = count + 1
        #if count == 2:
            #break

    for item in mylist:
        xiaoshuo_list = item.find_all('dl')
        for item in xiaoshuo_list:
            url = item.find('a').get('href')
            print "书的连接:", url
            getBookInfoBaseOnUrl(url)

    str_sql = str_sql.encode("utf-8")
    str_sql = str_sql[0:len(str_sql)-1]
    print "tmp_slq:", str_sql
    cursor.execute(str_sql)
    db.commit()
    cursor.close()

if __name__ == "__main__":
    print ("<<<-----Start Get Book INFO And Save Db------>>")
    get_book()

    db.close()
    print str_sql
点击复制链接 与好友分享!回本站首页
上一篇:理解 Python super()
下一篇:python爬虫记录(二)
相关文章
图文推荐
文章
推荐
点击排行

关于我们 | 联系我们 | 广告服务 | 投资合作 | 版权申明 | 在线帮助 | 网站地图 | 作品发布 | Vip技术培训 | 举报中心

版权所有: 红黑联盟--致力于做实用的IT技术学习网站