当前所在位置:珠峰网资料 >> 计算机 >> 计算机等级考试 >> 正文
Java获取网络文件并插入数据库
发布时间:2011/1/9 13:58:23 来源:城市学习网 编辑:ziteng

  抓取各大网站的数据并插入数据库,这样就不用为没有数据而烦恼了。

  获取百度的歌曲名,歌手和链接!!

  package webTools;

  import java.io.BufferedReader;

  import java.io.IOException;

  import java.io.InputStreamReader;

  import java.io.UnsupportedEncodingException;

  import java.net.MalformedURLException;

  import java.net.URL;

  import java.util.ArrayList;

  import java.util.HashMap;

  import java.util.List;

  import java.util.regex.Matcher;

  import java.util.regex.Pattern;

  import dbTools.DBTools;

  public class IOTOWeb {

  /**
   * Downloads the resource at the given URL and returns its text as a single string.
   *
   * <p>The bytes are decoded as gb2312 and read line by line. Line terminators are
   * not preserved: {@code readLine()} strips them and nothing is re-inserted, so the
   * result is all lines concatenated back to back.
   *
   * <p>Any I/O failure (malformed URL, unsupported charset, connection or read error)
   * is printed to standard error and whatever was read up to that point is returned;
   * the method never throws a checked exception and never returns {@code null}.
   *
   * @param htmlURL the address to fetch, in any form accepted by {@link URL}
   * @return the concatenated lines of the resource, or an empty/partial string on failure
   */
  public String getHtmlContent(String htmlURL) {
    StringBuffer htmlContent = new StringBuffer();
    java.io.InputStream stream = null;
    BufferedReader in = null;
    try {
      stream = new URL(htmlURL).openStream();
      in = new BufferedReader(new InputStreamReader(stream, "gb2312"));
      String rowContent;
      while ((rowContent = in.readLine()) != null) {
        htmlContent.append(rowContent);
      }
    } catch (IOException e) {
      // MalformedURLException and UnsupportedEncodingException are both
      // IOException subclasses, so a single handler covers all three cases.
      e.printStackTrace();
    } finally {
      // Close on every path. Closing the reader also closes the stream it wraps;
      // fall back to the raw stream if the reader was never constructed.
      java.io.Closeable resource = stream;
      if (in != null) {
        resource = in;
      }
      if (resource != null) {
        try {
          resource.close();
        } catch (IOException e) {
          e.printStackTrace();
        }
      }
    }
    return htmlContent.toString();
  }

  /**
   * Extracts every table cell that contains a link.
   *
   * <p>A match starts at a {@code <td ...>} tag that is followed (after optional
   * {@code (} characters) by an {@code <a ... href=...>} tag and runs up to the
   * next {@code </td>}. The href value may be double-quoted, single-quoted or
   * unquoted. Matching uses DOTALL, so a cell may span several lines.
   *
   * @param htmlContent the HTML fragment to scan
   * @return the full text of each matching cell, including the surrounding
   *         {@code <td>} tags, in document order; empty if nothing matches
   */
  public List<String> getLink(String htmlContent) {
    List<String> listLink = new ArrayList<String>();
    String regex = "<td[^>]*>[\\(]*<a[^>]*href=(\"([^\"]*)\"|\'([^\']*)\'|([^\\s>]*))[^>]*>(.*?)[\\)]*[\\s]*</td>";
    Matcher matcher = Pattern.compile(regex, Pattern.DOTALL).matcher(htmlContent);
    while (matcher.find()) {
      // group() with no argument is the whole matched cell, tags included.
      listLink.add(matcher.group());
    }
    return listLink;
  }

  /**
   * Extracts the value of every {@code href=} attribute in the given HTML.
   *
   * <p>Three forms are recognised: {@code href="value"}, {@code href='value'} and
   * the unquoted {@code href=value} (which ends at the first whitespace or
   * {@code >}). The returned strings are the bare values without the
   * {@code href=} prefix or the surrounding quotes.
   *
   * @param htmlContent the HTML fragment to scan
   * @return the href values in document order; empty if none are found
   */
  public List<String> getHref(String htmlContent) {
    List<String> listtHref = new ArrayList<String>();
    // No trailing quote after the alternation: it forced every match to end in
    // a double quote, which rejected the single-quoted and unquoted forms and
    // truncated double-quoted values containing whitespace to "".
    String regex = "href=(\"([^\"]*)\"|\'([^\']*)\'|([^\\s>]*))";
    Matcher ma = Pattern.compile(regex, Pattern.DOTALL).matcher(htmlContent);
    while (ma.find()) {
      // Exactly one of groups 2 (double-quoted), 3 (single-quoted) and
      // 4 (unquoted) participates in any given match.
      String value = ma.group(2);
      if (value == null) {
        value = ma.group(3);
      }
      if (value == null) {
        value = ma.group(4);
      }
      listtHref.add(value);
    }
    return listtHref;
  }

  /**
   * Finds every parenthesised link, i.e. text of the form
   * {@code (<a ... href=...>...)}, in the given HTML.
   *
   * <p>Each match is post-processed before being stored: the first occurrence of
   * {@code href="} is removed and then every remaining double quote is removed.
   * The result still contains the (now altered) tag markup and the enclosing
   * parentheses.
   *
   * @param htmlContent the HTML fragment to scan
   * @return the post-processed matches in document order; empty if none are found
   */
  public List<String> getPerson(String htmlContent) {
    final Pattern parenthesisedLink = Pattern.compile(
        "\\(<a[^>]*href=(\"([^\"]*)\"|\'([^\']*)\'|([^\\s>]*))[^>]*>(.*?)\\)",
        Pattern.DOTALL);
    List<String> persons = new ArrayList<String>();
    for (Matcher m = parenthesisedLink.matcher(htmlContent); m.find();) {
      String withoutHrefPrefix = m.group().replaceFirst("href=\"", "");
      String withoutQuotes = withoutHrefPrefix.replace("\"", "");
      persons.add(withoutQuotes);
    }
    return persons;
  }

  /**
   * Finds every complete anchor element ({@code <a ... href=...>...</a>}) that is
   * immediately followed by a whitespace character.
   *
   * <p>The stored text is the whole match: the opening tag, the link text, the
   * closing tag and the one trailing whitespace character.
   *
   * @param htmlContent the HTML fragment to scan
   * @return the matched anchors in document order; empty if none are found
   */
  public List<String> getSongName(String htmlContent) {
    final String anchorThenSpace =
        "<a[^>]*href=(\"([^\"]*)\"|\'([^\']*)\'|([^\\s>]*))[^>]*>(.*?)</a>\\s";
    Matcher anchors = Pattern.compile(anchorThenSpace, Pattern.DOTALL).matcher(htmlContent);
    List<String> songNames = new ArrayList<String>();
    for (boolean found = anchors.find(); found; found = anchors.find()) {
      songNames.add(anchors.group());
    }
    return songNames;
  }

  /**
   * Cuts the song-list tables out of a page.
   *
   * <p>A table is recognised only by its exact opening tag
   * ({@code width="100%" align="center" cellpadding="0" cellspacing="0" class="list"},
   * in that order and spacing) and extends to the nearest following
   * {@code </table>}. All matching tables are concatenated, tags included, with
   * nothing inserted between them.
   *
   * @param htmlContent the full page HTML
   * @return the concatenated matching tables, or an empty string if there are none
   */
  public String getMainContent(String htmlContent) {
    final Pattern listTable = Pattern.compile(
        "<table width=\"100%\" align=\"center\" cellpadding=\"0\" cellspacing=\"0\" class=\"list\">(.*?)</table>",
        Pattern.DOTALL);
    StringBuilder collected = new StringBuilder();
    for (Matcher m = listTable.matcher(htmlContent); m.find();) {
      collected.append(m.group());
    }
    return collected.toString();
  }

  /**
   * Strips markup from a string by deleting every shortest run that starts with
   * {@code <} and ends with {@code >}.
   *
   * <p>The pattern is compiled without DOTALL, so a {@code <...>} run that contains
   * a line terminator is left untouched.
   *
   * @param s the text to clean; must not be {@code null}
   * @return {@code s} with all such runs removed
   */
  public String outTag(final String s) {
    Matcher tags = Pattern.compile("<.*?>").matcher(s);
    return tags.replaceAll("");
  }

  // Helper from the dbTools package, created once per IOTOWeb instance.
  // NOTE(review): presumably used to store the scraped rows in the database;
  // its use is not visible in this part of the file, so confirm against the rest.
  DBTools dbTools = new DBTools();

  public void getFromBaiduMap3(String htmlURL) throws Throwable {

  HashMap htmlContentMap = new HashMap();

  String htmlContent = getHtmlContent(htmlURL);

  String mainContent = getMainContent(htmlContent);

  List listLink = getLink(mainContent);

  for (int j = 0; j < listLink.size(); j++) {

  String tdTag = listLink.get(j).toString();

  List songNameList = getSongName(tdTag);

  String songName = outTag(songNameList.get(0).toString());

  List personList = getPerson(tdTag);

  String songPerson = "";

  if (personList.size() != 0) {

  for (int n = 0; n < personList.size(); n++) {

  // System.out.println(personList.get(n).toString());

  songPerson = outTag(personList.get(n).toString());

  }

  } else {

  songPerson = "无";

  }

广告合作:400-664-0084 全国热线:400-664-0084
Copyright 2010 - 2017 www.my8848.com 珠峰网 粤ICP备15066211号
珠峰网 版权所有 All Rights Reserved