1.获取网页内容
/**
 * Downloads the page at {@code url} with an HTTP GET and parses it into a jsoup
 * {@link Document}.
 *
 * <p>The request uses socket and connect timeouts of 50000 ms and the BEST_MATCH cookie
 * spec. The response body is decoded as UTF-8 and {@code url} is used as the base URI for
 * the parsed document. A non-200 status is reported on {@code System.err}, but the body is
 * still parsed. After every request the method pauses for 50 ms.
 *
 * @param url the address of the page to fetch
 * @return the parsed document, or {@code null} if the request, the parsing or the closing
 *         of the connection failed with an I/O error, if the response had no entity, or if
 *         the thread was interrupted during the pause
 */
public Document urlToDoc(String url) {
    RequestConfig requestconfig = RequestConfig.custom()
            .setSocketTimeout(50000).setConnectTimeout(50000)
            .setCookieSpec(CookieSpecs.BEST_MATCH).build();
    HttpGet httpget = new HttpGet(url);
    httpget.setConfig(requestconfig);

    Document doc = null;
    // try-with-resources closes the response and then the client on every path,
    // including the one where execute() throws and no response exists yet.
    try (CloseableHttpClient httpclient = HttpClients.createDefault();
            CloseableHttpResponse response = httpclient.execute(httpget)) {
        int statusCode = response.getStatusLine().getStatusCode();
        if (statusCode != HttpStatus.SC_OK) {
            // Only reported: the body of an error response is still parsed below.
            System.err
                    .println("Method failed: " + response.getStatusLine());
        }
        HttpEntity entity = response.getEntity();
        if (entity != null) {
            try (InputStream in = entity.getContent()) {
                doc = Jsoup.parse(in, "utf-8", url);
            }
        }
    } catch (IOException e) {
        // Also covers ClientProtocolException (a subclass) and failures while closing.
        e.printStackTrace();
        doc = null;
    }

    // Short pause after each request; kept outside the resource handling so that an
    // interrupt can no longer prevent the connection from being closed.
    try {
        Thread.sleep(50);
    } catch (InterruptedException e) {
        // Restore the flag so callers can still observe the interruption.
        Thread.currentThread().interrupt();
        e.printStackTrace();
        return null;
    }
    return doc;
}
2.获取标签内信息
// Example jsoup selector usage on a fetched page.
// NOTE: urlToDoc returns null when the fetch fails, so check doc before using it.
Document doc = urlToDoc(url);
doc.select(".className").text();// text of the elements with class="className"
doc.select("#idName").text();// text of the element with id="idName" (ids use '#', not '.')
doc.select("标签").text();// text of the elements with the given tag name
doc.select("标签").html();// inner HTML of the selection as a String (an empty query is rejected by jsoup)
String attr=doc.select(".className #idName [src]").attr("src");
// value of the src attribute of the first element carrying one, inside id=idName, inside class=className
String first=doc.select("a").get(0).text();
// text of the first <a> element (indexes are zero-based)
3.完
Document 还有很多其他方法,这里没有全部用到,以后用到了再补充。






ssss
ssss