频道栏目
首页 > 程序开发 > 综合编程 > 其他综合 > 正文
HttpClient模拟登陆人人网,并且爬取日志内容(一)
2012-04-28 14:14:06           
收藏   我要投稿

使用HttpClient最新版本,下载地址:http://up.2cto.com/2012/0428/20120428021640813.zip
 

注释已经写的比较清楚了,就不再说明了。

[java]

/*
 *  Licensed to the Apache Software Foundation (ASF) under one or more
 *  contributor license agreements.  See the NOTICE file distributed with
 *  this work for additional information regarding copyright ownership.
 *  The ASF licenses this file to You under the Apache License, Version 2.0
 *  (the "License"); you may not use this file except in compliance with
 *  the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 *  Unless required by applicable law or agreed to in writing, software
 *  distributed under the License is distributed on an "AS IS" BASIS,
 *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *  See the License for the specific language governing permissions and
 *  limitations under the License.
 * ====================================================================
 *
 * This software consists of voluntary contributions made by many
 * individuals on behalf of the Apache Software Foundation.  For more
 * information on the Apache Software Foundation, please see <http://www.apache.org/>.
 * renren.com
 * <input type="hidden" name="origURL" value="http://www.renren.com/home" />
 * <input type="hidden" name="domain" value="renren.com" />
 * <input type="hidden" name="key_id" value="1" />
 * <input type="submit" id="login" class="input-submit login-btn" value="登录人人网" tabindex="5"/>
 * http://s.xnimg.cn/a36853/n/apps/login/login-all.js
 */ 
package org.apache.http.examples.client; 
 
import java.util.ArrayList; 
import java.util.List; 
import java.util.regex.Matcher; 
import java.util.regex.Pattern; 
 
import org.apache.http.Header; 
import org.apache.http.HttpEntity; 
import org.apache.http.HttpResponse; 
import org.apache.http.NameValuePair; 
import org.apache.http.client.HttpClient; 
import org.apache.http.client.entity.UrlEncodedFormEntity; 
import org.apache.http.client.methods.HttpGet; 
import org.apache.http.client.methods.HttpPost; 
import org.apache.http.impl.client.DefaultHttpClient; 
import org.apache.http.impl.conn.tsccm.ThreadSafeClientConnManager; 
import org.apache.http.message.BasicNameValuePair; 
import org.apache.http.protocol.HTTP; 
import org.apache.http.util.EntityUtils; 
 
/**
 * 
 * Purpose: Logs in to renren.com with HttpClient and fetches a blog entry page.
 * 
 * @author: shihuangzhe.com
 * @since: JDK 1.6
 * @date: 2012-4-28
 * 
 */ 
public class RrLogin { 
    /** Account name (placeholder value; replace with a real account). */ 
    private static final String userName = "xxxxx@yahoo.com.cn"; 
    /** Account password (placeholder value). */ 
    private static final String password = "******"; 
    /** Domain value submitted with the login form. */ 
    private static final String domain = "renren.com"; 
    /** key_id value submitted with the login form. */ 
    private static final String keyID = "1"; 
    /** URL the login form is submitted to. */ 
    private static String loginURL = "http://www.renren.com/PLogin.do"; 
    /**
     * Page to be redirected to after a successful login (the author's own
     * blog entry); renren's default redirect target is
     * http://www.renren.com/home.
     */ 
    private static final String targetUrl = "http://blog.renren.com/blog/84082953/398292611"; 
    /** Form field name: redirect URL. */ 
    private static final String _ORGI_URL = "origURL"; 
    /** Form field name: domain. */ 
    private static final String _DOMAIN = "domain"; 
    /** Form field name: key_id. */ 
    private static final String _KEY_ID = "key_id"; 
    /** Form field name: account. */ 
    private static final String _EMAIL = "email"; 
    /** Form field name: password. */ 
    private static final String _PASSWORD = "password"; 
    /**
     * HTTP client backed by a ThreadSafeClientConnManager, which the original
     * author chose for multi-thread safety.
     */ 
    private HttpClient client = new DefaultHttpClient( 
            new ThreadSafeClientConnManager()); 
 
    /**
     * Purpose: 登陆renren.com
     * 
     * @throws Exception
     * @return: void
     */ 
    private void login() throws Exception { 
        HttpPost httpost = new HttpPost(loginURL); 
        try { 
            // 为请求参数赋值 
            List<NameValuePair> nvps = new ArrayList<NameValuePair>(); 
            nvps.add(new BasicNameValuePair(_ORGI_URL, targetUrl)); 
            nvps.add(new BasicNameValuePair(_DOMAIN, domain)); 
            nvps.add(new BasicNameValuePair(_KEY_ID, keyID)); 
            nvps.add(new BasicNameValuePair(_EMAIL, userName)); 
            nvps.add(new BasicNameValuePair(_PASSWORD, password)); 
            httpost.setEntity(new UrlEncodedFormEntity(nvps, HTTP.UTF_8)); 
            // 获取请求相应 
            HttpResponse response = client.execute(httpost); 
            /*
             * 注意,因为renren.com登陆成功后,需要再次经过
             * http://www.renren.com/callback.do?t=da278e2526f9b2387ea22e57578a85d93
             * &
             * origURL=http%3A%2F%2Fblog.renren.com%2Fblog%2F84082953%2F398292611
             * &needNotify=false 这种方式跳转,所以需要再次处理发一次请求
             */ 
            Header locationHeader = response.getFirstHeader("Location"); 
            HttpGet httpget = new HttpGet(locationHeader.getValue()); 
            HttpResponse response2 = client.execute(httpget); 
            // 获取Entity 
            HttpEntity entity = response2.getEntity(); 
//           System.out.println(EntityUtils.toString(entity)); 
            String html = EntityUtils.toString(entity); 
            System.out.println(html); 
//          // 解析html,拿出blog 
            String context = printBlog(html); 
            System.out.println("---------解析后的内容----------- "); 
            System.out.println(context); 
        } finally { 
            // When HttpClient instance is no longer needed, 
            // shut down the connection manager to ensure 
            // immediate deallocation of all system resources 
            client.getConnectionManager().shutdown(); 
        } 
    } 
    /**
     * Purpose: 正则提取blog内容
     * @param orgTest
     * @return: String
     */ 
    private String printBlog(String orgTest) { 
        // 正则匹配规则 
//      String regexp = "<p\\s*id=\"blogContent\"\\s*[^>]*>(.+?)</p>"; 
//      String regexp = "(<p id=\"blogContent\" class=\"text-article\")(.+?)( </p>)"; 
        Pattern pattern = Pattern.compile("<p\\s*id=\"blogContent\"\\s*[^>]*>(.+?)</p>"); 
        Matcher m = pattern.matcher(orgTest); 
        if (!m.find()) { 
            return null; 
        } 
        return m.group(0); 
    } 
 
    public static void main(String[] args) throws Exception { 
        RrLogin renRen = new RrLogin(); 
        renRen.login(); 
    } 

目前,我爬取到renren.com我的主页的某一篇日志后,想使用java正则来解析,提取日志内容,但是现在有点问题,正则好像无法提取到。大家如果谁知道,可以给我一个建议。


摘自 昨日凡阳

点击复制链接 与好友分享!回本站首页
相关TAG标签 人人 内容 日志
上一篇:ActiveMQ消息收发简单例子
下一篇:Extjs应用tab页的最简单Demo
相关文章
图文推荐
文章
推荐
点击排行

关于我们 | 联系我们 | 广告服务 | 投资合作 | 版权申明 | 在线帮助 | 网站地图 | 作品发布 | Vip技术培训 | 举报中心

版权所有: 红黑联盟--致力于做实用的IT技术学习网站