服务器之家:专注于服务器技术及软件下载分享
分类导航

PHP教程|ASP.NET教程|Java教程|ASP教程|编程技术|正则表达式|C/C++|IOS|C#|Swift|Android|VB|R语言|JavaScript|易语言|vb.net|

服务器之家 - 编程语言 - Java教程 - java实现登录之后抓取数据

java实现登录之后抓取数据

2021-05-19 14:04最是那一低头的温柔 Java教程

这篇文章给大家分享了用 Java 实现登录之后抓取网站数据的相关知识,有兴趣的朋友可以测试、参考一下。

最近做了一个从网络上抓取数据的小程序,主要用于信贷方面:收集了一些黑名单网站,并把这些网站上的数据抓取到自己的系统中。

也找了一些资料,觉得没有一个很好的,全面的例子。因此在这里做个笔记提醒自己。

首先需要一个 jsoup 的 jar 包,我用的是 1.6.0 版本。下载地址为:http://pan.baidu.com/s/1mgqouha

1,获取网页内容(核心代码,技术有限没封装)。

2,登录之后抓取网页数据(如何在请求中携带cookie)。

3,获取网站的ajax请求方法(返回json)。

以上这三点我用一个类全部包含了(代码比较粗糙,望见谅;直接把代码 copy 过去,应该就可以用)。

一、这个类包含了上面 1、2、3 三种方法,直接运行 main 方法即可进行测试。

代码如下(Java):
?
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
package com.minxinloan.black.web.utils;
 
import java.io.bufferedreader;
import java.io.bytearrayoutputstream;
import java.io.datainputstream;
import java.io.dataoutputstream;
import java.io.file;
import java.io.fileoutputstream;
import java.io.filewriter;
import java.io.ioexception;
import java.io.inputstream;
import java.io.inputstreamreader;
import java.io.outputstream;
import java.io.printwriter;
import java.net.httpurlconnection;
import java.net.url;
import java.net.urlconnection;
import java.net.urlencoder;
import java.nio.charset.charset;
import java.util.arraylist;
import java.util.hashmap;
import java.util.iterator;
import java.util.list;
import java.util.map;
import java.util.map.entry;
import java.util.stringtokenizer;
 
import net.sf.json.jsonarray;
import net.sf.json.jsonobject;
 
import org.jsoup.connection;
import org.jsoup.connection.method;
import org.jsoup.jsoup;
import org.jsoup.nodes.document;
import org.jsoup.nodes.element;
import org.jsoup.select.elements;
 
/**
 * Demonstrates three scraping techniques in a single {@code main} method: logging in with jsoup
 * and re-using the login cookies on a second request, fetching a page without logging in, and
 * reading a URL through {@code httpurlconnection} and parsing the response body as JSON.
 *
 * <p>NOTE(review): every identifier in this listing is lowercase ({@code string},
 * {@code jsoup.connect}, {@code url} used both as a type and as a local variable), which is not
 * valid Java - presumably the casing was lost when the article was extracted. Restore the
 * original casing before compiling.
 */
public class cookieutil {
 
  // Header-name constant; it is not referenced anywhere else in this class.
  public final static string content_type = "content-type";
 
  /**
   * Runs the three demonstrations in sequence against hard-coded URLs.
   *
   * <p>The text and attribute values read from the fetched pages are discarded; the loops only
   * show which calls to use.
   *
   * <p>NOTE(review): the login user name and password are hard-coded string literals below -
   * move them to configuration before reusing this code.
   *
   * @param args command-line arguments (unused)
   */
  public static void main(string[] args) {
    
    //string loginurl = "http://www.p2peye.com/member.php?mod=logging&action=login&loginsubmit=yes&loginhash=lsc66&username=puqiuxiaomao&password=a1234567";
    string listurl = "http://www.p2peye.com/blacklist.php?p=2";
    string logurl = "http://www.p2peye.com/member.php";
 
 
    //******************************** Pages that require a login ********************************
    try {
        // Submit the login form as a POST request and keep the response (it carries the cookies).
        connection.response res =
            jsoup.connect(logurl)
              .data("mod","logging"
                  ,"action","login"
                  ,"loginsubmit","yes"
                  ,"loginhash","lsc66"
                  ,"username","puqiuxiaomao"
                  ,"password","a1234567")
              .method(method.post)
              .execute();
        
        
        // The session-id cookie name depends on the session cookie set by the target site.
        connection con=jsoup.connect(listurl);
        // Choose how the client identifies itself (desktop or mobile) through the user-agent header.
        con.header("user-agent", "mozilla/4.0 (compatible; msie 7.0; windows nt 5.1)");
        // Take the cookies returned by the login response as a map.
        map <string,string> map=res.cookies();
        iterator<entry<string,string>> it =map.entryset().iterator();
        while(it.hasnext()){
          entry<string,string> en= it.next();
          // Copy each login cookie onto the follow-up request.
          con =con.cookie(en.getkey(), en.getvalue());
          
        }
        // Fetch the list page, now sending the login cookies.
        document objectdoc = con.get();
        
        elements elements = objectdoc.getallelements();// All elements of the returned page (close to, but not exactly, the page source).
        for (element element : elements) {
          // element is the tag being iterated, e.g. <div><span></span></div>
          elements elements2= element.getallelements();// Elements returned by getallelements() for this tag.
           for (element element2 : elements2) {
             element2.text();
             element2.attr("href");// Read a tag attribute; here element2 stands for an <a> tag and href is the attribute.
             element2.text();// Read the tag text.
          }
        }
        
        //******************************** Pages that do not require a login ********************************
        
        string url = "http://www.p2peye.com/blacklist.php?p=2";
        document contemp = jsoup.connect(url).get();
        elements elementstemps = contemp.getallelements();
         for (element elementstemp : elementstemps) {
           elementstemp.text();
           elementstemp.attr("href");// Read a tag attribute (href).
           elementstemp.text();// Read the tag text.
        }
        
        
        //******************************** Fetching content the way an ajax call would ********************************
         httpurlconnection connection = null;
          bufferedreader reader = null;
          try {
            stringbuffer sb = new stringbuffer();
            url geturl = new url(url);
            connection = (httpurlconnection)geturl.openconnection();
            reader = new bufferedreader(new inputstreamreader(
                connection.getinputstream(),"utf-8"));
            string lines;
            // Concatenate the response lines (line breaks are dropped).
            while ((lines = reader.readline()) != null) {
              sb.append(lines);
            };
            // Convert the JSON text to a list; the result is not used afterwards.
            list<map<string, object>> list = parsejson2list(sb.tostring());
          } catch (exception e) {
            // NOTE(review): every failure, including JSON parse errors, is silently ignored here.
          } finally{
            if(reader!=null)
              try {
                reader.close();
              } catch (ioexception e) {
              }
            // Disconnect.
            // NOTE(review): connection is still null if the code above fails before it is
            // assigned, so this unguarded call can throw a NullPointerException.
            connection.disconnect();
          }
        
    } catch (ioexception e) {
      // todo auto-generated catch block
      e.printstacktrace();
    }
    
  }
  
 
  /**
   * Converts a JSON object string into a map keyed by the object's keys.
   *
   * <p>A value that is a {@code jsonarray} is replaced by a list of maps, built by calling this
   * method recursively on each array element; any other value is stored unchanged.
   *
   * <p>NOTE(review): the iterator is typed as {@code jsonobject}, so this presumably fails on
   * arrays that contain plain values such as strings or numbers - confirm against the data.
   *
   * @param jsonstr JSON text whose outermost value is an object
   * @return the map built from the object
   */
  public static map<string, object> parsejson2map(string jsonstr){
    map<string, object> map = new hashmap<string, object>();
    // Parse the outermost level.
    jsonobject json = jsonobject.fromobject(jsonstr);
    for(object k : json.keyset()){
      object v = json.get(k); 
      // If the value is itself an array, keep parsing its elements.
      if(v instanceof jsonarray){
        list<map<string, object>> list = new arraylist<map<string,object>>();
        iterator<jsonobject> it = ((jsonarray)v).iterator();
        while(it.hasnext()){
          jsonobject json2 = it.next();
          list.add(parsejson2map(json2.tostring()));
        }
        map.put(k.tostring(), list);
      } else {
        map.put(k.tostring(), v);
      }
    }
    return map;
  }
  
  /**
   * Converts a JSON array string into a list with one map per array element, each built by
   * {@link #parsejson2map}.
   *
   * <p>NOTE(review): as in parsejson2map, each element is treated as a {@code jsonobject} -
   * confirm that the input never contains plain values.
   *
   * @param jsonstr JSON text whose outermost value is an array
   * @return one map per array element, in iteration order
   */
  public static list<map<string, object>> parsejson2list(string jsonstr){
    jsonarray jsonarr = jsonarray.fromobject(jsonstr);
    list<map<string, object>> list = new arraylist<map<string,object>>();
    iterator<jsonobject> it = jsonarr.iterator();
    while(it.hasnext()){
      jsonobject json2 = it.next();
      list.add(parsejson2map(json2.tostring()));
    }
    return list;
  }
  
  
 
}

二、这个是获取验证码的类,可以研究一下(但你需要先分析出网站验证码的请求地址)。

?
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
package com.minxinloan.black.web.utils;
 
import java.io.BufferedReader;
import java.io.DataInputStream;
import java.io.DataOutputStream;
import java.io.File;
import java.io.FileOutputStream;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStream;
import java.io.OutputStreamWriter;
import java.io.PrintWriter;
import java.io.Writer;
import java.net.HttpURLConnection;
import java.net.URL;
import java.net.URLConnection;
import java.net.URLEncoder;
import java.nio.charset.Charset;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.StringTokenizer;
 
/**
 * HTTP helpers for downloading a site's captcha image and page source ahead of a scripted login.
 *
 * <p>The declared class, method and parameter names are kept exactly as published so existing
 * callers are unaffected; references to JDK types use their case-sensitive spelling.
 */
public class utils {

  /** Encoding reported when a content type carries no usable {@code charset} parameter. */
  private static final String DEFAULT_ENCODING = "utf-8";

  /** Connect and read timeout applied to the captcha request, in milliseconds. */
  private static final int TIMEOUT_MILLIS = 5000;

  /**
   * Requests {@code surl} and, unless only headers are wanted, saves the response body (the
   * captcha image) to {@code path + "/code.bmp"}.
   *
   * <p>The returned {@link content} always has an empty body: the payload is binary and goes to
   * disk, while the response headers (for example {@code Set-Cookie}) are what a later login
   * request needs.
   *
   * @param method HTTP verb, {@code "get"} or {@code "post"} in any letter case; anything else,
   *     including {@code null}, is treated as POST
   * @param surl URL to request
   * @param parammap form fields sent URL-encoded as the body of a POST request; may be
   *     {@code null} or empty, and is ignored for GET
   * @param requestheadermap extra request headers such as {@code Cookie}; may be {@code null}
   * @param isonlyreturnheader when {@code true} the response body is not read or saved
   * @param path directory that receives {@code code.bmp}; required unless
   *     {@code isonlyreturnheader} is {@code true}
   * @return the URL, an empty body and the response headers, or {@code null} when the URL is not
   *     an HTTP(S) URL, the status is neither 200 nor 201, or an I/O error occurs
   */
  public static content getrandom(String method, String surl,
      Map<String, String> parammap,
      Map<String, String> requestheadermap,
      boolean isonlyreturnheader, String path) {

    if (!isonlyreturnheader && path == null) {
      return null;
    }
    // setRequestMethod only accepts upper-case verbs, so normalise instead of passing through.
    String verb = "GET".equalsIgnoreCase(method) ? "GET" : "POST";
    boolean hasForm = "POST".equals(verb) && parammap != null && !parammap.isEmpty();

    HttpURLConnection connection = null;
    try {
      connection = openHttp(surl);
      if (connection == null) {
        return null;
      }
      connection.setRequestMethod(verb);
      connection.setRequestProperty("accept-language", "zh-cn,zh;q=0.5");
      // Redirects are not followed, so the caller sees the first response's headers.
      connection.setInstanceFollowRedirects(false);
      connection.setDoInput(true);
      connection.setDoOutput(hasForm);
      connection.setConnectTimeout(TIMEOUT_MILLIS);
      connection.setReadTimeout(TIMEOUT_MILLIS);
      connection.setUseCaches(false);
      // Kept from the original; note that this changes the default for later connections too.
      connection.setDefaultUseCaches(false);
      if (hasForm) {
        // Set before the caller's headers so an explicit content type from the caller wins.
        connection.setRequestProperty("Content-Type", "application/x-www-form-urlencoded");
      }
      if (requestheadermap != null) {
        for (Map.Entry<String, String> header : requestheadermap.entrySet()) {
          if (header.getKey() != null && header.getValue() != null) {
            connection.setRequestProperty(header.getKey(), header.getValue());
          }
        }
      }
      if (hasForm) {
        byte[] form = encodeForm(parammap).getBytes("UTF-8");
        try (OutputStream requestBody = connection.getOutputStream()) {
          requestBody.write(form);
        }
      }

      int status = connection.getResponseCode();
      if (status != HttpURLConnection.HTTP_OK && status != HttpURLConnection.HTTP_CREATED) {
        return null;
      }
      if (!isonlyreturnheader) {
        saveBody(connection, path + "/code.bmp");
      }
      return new content(surl, "", connection.getHeaderFields());
    } catch (IOException e) {
      System.err.println("getrandom failed for " + surl + ": " + e);
      return null;
    } finally {
      if (connection != null) {
        connection.disconnect();
      }
    }
  }

  /**
   * Extracts the {@code charset} parameter from a {@code Content-Type} header value.
   *
   * <p>The first {@code ;}-separated token is taken to be the media type and skipped. When
   * several supported {@code charset} parameters are present the last one wins. A value that is
   * empty, malformed or names an unsupported charset is ignored.
   *
   * @param contenttype header value such as {@code text/html; charset=utf-8}; may be
   *     {@code null}
   * @return the charset name as written in the header, {@code "utf-8"} when none is usable, or
   *     {@code null} when {@code contenttype} is {@code null}
   */
  public static String getencodingfromcontenttype(String contenttype) {
    if (contenttype == null) {
      return null;
    }
    String encoding = null;
    StringTokenizer parts = new StringTokenizer(contenttype, ";");
    if (parts.hasMoreTokens()) {
      parts.nextToken(); // media type
    }
    while (parts.hasMoreTokens()) {
      String parameter = parts.nextToken().trim();
      int eq = parameter.indexOf('=');
      if (eq == -1 || !"charset".equalsIgnoreCase(parameter.substring(0, eq).trim())) {
        continue;
      }
      String value = unquote(parameter.substring(eq + 1).trim());
      if (isSupportedCharset(value)) {
        encoding = value;
      }
    }
    return encoding == null ? DEFAULT_ENCODING : encoding;
  }

  /**
   * Writes {@code content} to the file at {@code path} as UTF-8, creating or truncating it.
   *
   * @param content text to write
   * @param path destination file
   * @return {@code true} when the text was written and the file closed without error;
   *     {@code false} when an argument is {@code null} or an I/O error occurs
   */
  public static boolean infile(String content, String path) {
    if (content == null || path == null) {
      return false;
    }
    // try-with-resources: the writer is closed even on failure, and a failed open can no longer
    // raise a NullPointerException from a finally block.
    try (Writer out = new OutputStreamWriter(new FileOutputStream(path), "UTF-8")) {
      out.write(content);
      return true;
    } catch (IOException e) {
      System.err.println("infile failed for " + path + ": " + e);
      return false;
    }
  }

  /**
   * Downloads {@code httpurl} and returns the response decoded as UTF-8, with every line
   * terminated by {@code "\n"}. The HTTP status code is printed to standard output.
   *
   * @param httpurl URL to fetch
   * @return the page text, or an empty string when the URL is {@code null}, is not an HTTP(S)
   *     URL, or an I/O error occurs
   */
  public static String gethtmlreadline(String httpurl) {
    if (httpurl == null) {
      return "";
    }
    HttpURLConnection connection = null;
    try {
      connection = openHttp(httpurl);
      if (connection == null) {
        return "";
      }
      connection.connect();
      System.out.println(connection.getResponseCode());

      StringBuilder page = new StringBuilder();
      try (BufferedReader reader = new BufferedReader(
          new InputStreamReader(connection.getInputStream(), "utf-8"))) {
        String line;
        while ((line = reader.readLine()) != null) {
          page.append(line).append('\n');
        }
      }
      return page.toString();
    } catch (IOException e) {
      System.err.println("gethtmlreadline failed for " + httpurl + ": " + e);
      return "";
    } finally {
      if (connection != null) {
        connection.disconnect();
      }
    }
  }

  /** Opens {@code address}, returning {@code null} when it is not an HTTP(S) URL. */
  private static HttpURLConnection openHttp(String address) throws IOException {
    URLConnection connection = new URL(address).openConnection();
    return connection instanceof HttpURLConnection ? (HttpURLConnection) connection : null;
  }

  /** Encodes {@code fields} as {@code application/x-www-form-urlencoded} text in UTF-8. */
  private static String encodeForm(Map<String, String> fields) throws IOException {
    StringBuilder form = new StringBuilder();
    for (Map.Entry<String, String> field : fields.entrySet()) {
      if (field.getKey() == null) {
        continue;
      }
      if (form.length() > 0) {
        form.append('&');
      }
      String value = field.getValue() == null ? "" : field.getValue();
      form.append(URLEncoder.encode(field.getKey(), "UTF-8"))
          .append('=')
          .append(URLEncoder.encode(value, "UTF-8"));
    }
    return form.toString();
  }

  /** Copies the response body of {@code connection} into the file {@code target}. */
  private static void saveBody(HttpURLConnection connection, String target) throws IOException {
    try (InputStream in = connection.getInputStream();
        OutputStream out = new FileOutputStream(target)) {
      byte[] buffer = new byte[4096];
      int count;
      // read() returns -1 at end of stream; a 0-byte read does not mean the stream is finished.
      while ((count = in.read(buffer)) != -1) {
        out.write(buffer, 0, count);
      }
    }
  }

  /** Removes one pair of surrounding double quotes, if present. */
  private static String unquote(String value) {
    if (value.length() >= 2 && value.startsWith("\"") && value.endsWith("\"")) {
      return value.substring(1, value.length() - 1);
    }
    return value;
  }

  /** Reports whether {@code name} is a legal, supported charset name, without throwing. */
  private static boolean isSupportedCharset(String name) {
    try {
      return Charset.isSupported(name);
    } catch (IllegalArgumentException e) {
      // Charset.isSupported rejects syntactically illegal names (including "") by throwing.
      return false;
    }
  }
}


/** Immutable result of an HTTP request: the requested URL, the body text and the headers. */
class content {
  private final String url;
  private final String body;
  private final Map<String, List<String>> headers;

  /**
   * @param url URL that was requested
   * @param body response body text
   * @param headers response headers; {@code null} is stored as an empty map
   */
  public content(String url, String body, Map<String, List<String>> headers) {
    this.url = url;
    this.body = body;
    this.headers = headers == null
        ? Collections.<String, List<String>>emptyMap()
        : headers;
  }

  /** Returns the URL that was requested. */
  public String geturl() {
    return url;
  }

  /** Returns the response body text. */
  public String getbody() {
    return body;
  }

  /** Returns the response headers; never {@code null}. */
  public Map<String, List<String>> getheaders() {
    return headers;
  }

}

原文链接:https://blog.csdn.net/HUXU981598436/article/details/79134920

延伸 · 阅读

精彩推荐