1、使用工具:
Python 3.5
BeautifulSoup
2、抓取网站:
CSDN 热门文章列表 http://blog.csdn.net/hot.html
3、实现代码:
__author__ = 'Administrator'

import urllib.request
import re

from bs4 import BeautifulSoup


########################################################
#
# Scrape the CSDN article list, e.g. http://blog.csdn.net/?&page=1
#
########################################################
class CsdnUtils(object):
    """Fetch a CSDN blog-list page and print one line per listed article."""

    def __init__(self):
        """Prepare the HTTP headers sent with every request."""
        user_agent = 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'
        self.headers = {
            'Cache-Control': 'max-age=0',
            'Connection': 'keep-alive',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            'User-Agent': user_agent,
        }

    def getPage(self, url=None):
        """Download ``url`` and return it parsed as a ``BeautifulSoup`` tree.

        The request carries ``self.headers``. Any error raised by
        ``urllib.request`` (bad URL, HTTP/network failure) propagates to
        the caller.
        """
        request = urllib.request.Request(url, headers=self.headers)
        # Use a context manager so the HTTP response is always closed.
        with urllib.request.urlopen(request) as response:
            return BeautifulSoup(response.read(), "html.parser")

    @staticmethod
    def _text(tag):
        """Return ``tag.string``, or ``None`` when the tag was not found."""
        return tag.string if tag is not None else None

    def parsePage(self, url=None, page=None):
        """Fetch ``url`` and print the articles found in it.

        ``page`` is only used as a label in the banner line printed before
        the articles; it does not influence which URL is fetched.

        Each ``div.blog_list`` element yields one printed row with: running
        number, author, post time, view count, category, title and link.
        A field whose tag is missing is printed as ``None`` (the category
        falls back to the string "None") instead of aborting the whole page.
        """
        soup = self.getPage(url)
        entries = soup.find_all('div', 'blog_list')
        print("========================第", page, "页======================================")
        for index, entry in enumerate(entries):
            # Per-article values are plain locals: storing them on the class
            # would leak the last article's data into every instance.
            author = self._text(entry.find('a', 'user_name'))
            post_time = self._text(entry.find('span', 'time'))
            views = self._text(entry.find('a', 'view'))

            heading = entry.find('h1')
            if heading is not None:
                category_link = heading.find('a', 'category')
                title_link = heading.find('a', attrs={'name': True})
            else:
                category_link = None
                title_link = None

            # Look the category link up directly rather than inferring its
            # presence from the first <a> having a class attribute.
            category = category_link.string if category_link is not None else "None"
            title = self._text(title_link)
            link = title_link.get("href") if title_link is not None else None

            print("数据:", index + 1, '\t', author, '\t', post_time, '\t', views, '\t', category, '\t', title, '\t', link)


####### Entry point ########
if __name__ == "__main__":
    # URL to scrape. A paged alternative would be
    # 'http://blog.csdn.net/?&page={}'.format(i + 1).
    url = "http://blog.csdn.net/hot.html"
    cnblog = CsdnUtils()
    # NOTE(review): ``url`` does not vary with ``i``, so every iteration
    # fetches the same page; the number passed is only the banner label.
    for i in range(0, 5):
        cnblog.parsePage(url, i + 1)
4、执行结果: