频道栏目
首页 > 资讯 > Python > 正文

Python 抓取 Google Play 应用信息

13-08-01        来源:[db:作者]  
收藏   我要投稿
#!/usr/bin/env python
#-*- coding: utf-8  -*-
import urllib
import urllib2
import random
import requests
import os,sys
import MySQLdb
from sgmllib import SGMLParser
from BeautifulSoup import BeautifulSoup
import re
num = 0  # running count of cover images saved; doubles as the next file name

# All patterns are applied to BeautifulSoup-prettified markup, so captured
# values normally still carry the newlines/indentation prettify() inserts.
_APP_LINK_RE = re.compile(r'<a class="title" href="(.+?)" title')
_TITLE_RE = re.compile(r'<div class="document-title" itemprop="name">[\s\S]*?<div>([\s\S]*?)</div>')
_DEVELOPER_RE = re.compile(r'itemprop="name">([\s\S]*?)</a>')
_DESCRIPTION_RE = re.compile(r'itemprop="description">[\s\S]*?<div>([\s\S]*?)</div>')
_COVER_RE = re.compile(r'<img class="cover-image" src=(.+?) alt="Cover art" itemprop="image" />')
# Alternation of whole tags. The previous '[<br /> <p> </p>]' was a character
# class and therefore blanked every 'b', 'r', 'p', '/', '<' and '>' character.
_MARKUP_RE = re.compile(r'<br\s*/?>|</?p>')

# Index of the developer-name line inside the first _DEVELOPER_RE match.
# NOTE(review): this offset comes from the original script and presumably
# reflects the prettified page layout at the time -- confirm against live markup.
_DEVELOPER_LINE = 8
_COVER_DIR = "googlemarket"
_TIMEOUT = 30  # seconds; without a timeout a stalled connection hangs the crawl
_IMAGE_HEADERS = {'User-Agent':'Mozilla/5.0 (Windows; U; Windows NT 6.1; rv:2.2) Gecko/20110201'}


def _first_match(pattern, text):
    """Return the first regex match of *pattern* in *text*, or None if absent."""
    found = re.findall(pattern, text)
    return found[0] if found else None


def _clean_description(fragment):
    """Replace <br>, <br/>, <br />, <p> and </p> tags in *fragment* with a space."""
    return _MARKUP_RE.sub(' ', fragment)


def _cover_url(attr_value):
    """Strip whitespace and the surrounding quotes from a captured src value.

    The old slice ``[1:-2]`` also removed the last character of the URL
    itself (the logged request ended in ``=w30``).
    """
    return attr_value.strip().strip('"\'')


def _developer_name(page):
    """Return the developer line from *page*, or None when it cannot be found."""
    block = _first_match(_DEVELOPER_RE, page)
    if block is None:
        return None
    lines = block.split("\n")
    if len(lines) <= _DEVELOPER_LINE:
        return None
    return lines[_DEVELOPER_LINE]


def _fetch_pretty_html(url):
    """Download *url* and return it as prettified UTF-8 markup.

    Returns None (after printing the error) when the request fails, so a
    single unreachable page does not abort the whole crawl.
    """
    try:
        response = requests.get(url, timeout=_TIMEOUT)
    except requests.exceptions.RequestException as e:
        print("failed to fetch %s: %s" % (url, e))
        return None
    return BeautifulSoup(response.content).prettify("utf-8")


def _save_cover(content):
    """Write image bytes to googlemarket/<num> and advance the global counter."""
    global num
    if not os.path.isdir(_COVER_DIR):
        os.makedirs(_COVER_DIR)
    # Binary mode plus a context manager: the data is an image, and the
    # handle must be closed even if the write fails.
    with open(os.path.join(_COVER_DIR, str(num)), "wb") as f:
        f.write(content)
    num = num + 1
    print(num)


def _crawl_app(url):
    """Fetch one app detail page, print its fields and save its cover image(s)."""
    print(url)
    page = _fetch_pretty_html(url)
    if page is None:
        return

    # App name
    for title in re.findall(_TITLE_RE, page):
        print(title)

    # Developer
    developer = _developer_name(page)
    if developer is not None:
        print(developer)

    # Version, update date, file size, supported OS -- each may be missing
    # from a page, so a field is printed only when it was found.
    for field in ("softwareVersion", "datePublished", "fileSize", "operatingSystems"):
        value = _first_match(r'itemprop="%s">([\s\S]*?)</div>' % field, page)
        if value is not None:
            print(value)

    # Description
    for fragment in re.findall(_DESCRIPTION_RE, page):
        print(_clean_description(fragment))

    # NOTE(review): the original had its INSERT into the `address` table
    # (name, version, developer, pubtime, filesize, support, introduction)
    # commented out, leaving an `if` with no body, which is a syntax error.
    # The database write stays disabled here; re-enable it deliberately,
    # using a parameterized cursor.execute(sql, values).

    # Cover image(s): each one found is downloaded and stored immediately, so
    # a page without a cover can no longer reuse a stale or undefined response.
    for quoted in re.findall(_COVER_RE, page):
        image_url = _cover_url(quoted)
        print(image_url)
        try:
            image = requests.get(image_url, headers=_IMAGE_HEADERS, timeout=_TIMEOUT)
        except requests.exceptions.RequestException as e:
            print("failed to fetch %s: %s" % (image_url, e))
            continue
        _save_cover(image.content)


def main():
    """Crawl every Google Play category listing and each app page linked from it.

    Connects to the local ``googlemarket`` MySQL database first and exits
    with status 1 if that fails. Pages or images that cannot be downloaded
    are reported and skipped. The connection is closed when the crawl ends.
    """
    try:
        # NOTE(review): credentials are hard-coded; move them to configuration.
        conn=MySQLdb.connect(host='localhost',user='root',passwd='123456',db='googlemarket',charset="utf8")
        conn.query("set names utf8")
    except Exception as e:
        print(e)
        sys.exit(1)

    categories = (
        'PERSONALIZATION', 'TRANSPORTATION', 'SPORTS', 'HEALTH_AND_FITNESS',
        'APP_WALLPAPER', 'COMICS', 'MEDICAL', 'BUSINESS', 'BOOKS_AND_REFERENCE',
        'WEATHER', 'ENTERTAINMENT', 'MEDIA_AND_VIDEO', 'APP_WIDGETS', 'TOOLS',
        'PHOTOGRAPHY', 'PRODUCTIVITY', 'EDUCATION', 'NEWS_AND_MAGAZINES',
        'TRAVEL_AND_LOCAL', 'LIFESTYLE', 'SOCIAL', 'FINANCE', 'SHOPPING',
        'LIBRARIES_AND_DEMO', 'COMMUNICATION', 'MUSIC_AND_AUDIO', 'GAME',
    )
    try:
        # Iterate the tuple itself so the loop cannot drift out of sync with
        # the number of categories.
        for category in categories:
            listing = _fetch_pretty_html("https://play.google.com/store/apps/category/" + category)
            if listing is None:
                continue
            # set() drops app links that appear more than once in a listing.
            for path in set(re.findall(_APP_LINK_RE, listing)):
                _crawl_app("https://play.google.com" + path)
    finally:
        conn.close()
 
   
  
    
 
   
# Start the crawl only when this file is run as a script, not when imported.
if __name__ == "__main__":
    main()
运行输出（节选）：
<type 'str'>
Traceback (most recent call last):
  File "crawler0729.py", line 103, in <module>
    main()
  File "crawler0729.py", line 91, in main
    temp=requests.get(j[1:-2], headers=headers)
  File "/usr/local/lib/python2.7/dist-packages/requests/api.py", line 55, in get
    return request('get', url, **kwargs)
  File "/usr/local/lib/python2.7/dist-packages/requests/api.py", line 44, in request
    return session.request(method=method, url=url, **kwargs)
  File "/usr/local/lib/python2.7/dist-packages/requests/sessions.py", line 335, in request
    resp = self.send(prep, **send_kwargs)
  File "/usr/local/lib/python2.7/dist-packages/requests/sessions.py", line 438, in send
    r = adapter.send(request, **kwargs)
  File "/usr/local/lib/python2.7/dist-packages/requests/adapters.py", line 327, in send
    raise ConnectionError(e)
requests.exceptions.ConnectionError: HTTPSConnectionPool(host='lh3.ggpht.com', port=443): Max retries exceeded with url: /RBld17rLw4Ik0JtOaKk4bZB2RiGJ2R8H5Q8Rjw3Hh6BAM694fOzzKj1TJFr7R02ZS_40=w30 (Caused by <class 'socket.error'>: [Errno 101] Network is unreachable)
相关TAG标签
上一篇:Eclipse中tomcat修改端口号
下一篇:Android 监听程序的安装和卸载
相关文章
图文推荐

关于我们 | 联系我们 | 广告服务 | 投资合作 | 版权申明 | 在线帮助 | 网站地图 | 作品发布 | Vip技术培训 | 举报中心

版权所有: 红黑联盟--致力于做实用的IT技术学习网站