解析HTML文档
用DOM解析HTML其实跟解析XML的过程差不多,主要区别在于如何把HTML或XML解析(parse)成 org.w3c.dom.Document;经过这一步之后,后面的处理方法就都一样了。
CyberNeko HTML Parser 是一个非常优秀的网页parser工具,可以用它来实现这一点。
其它的不多说,贴两个工具类出来,看看就能全部明白。
package org.dutir.util.dom;
import java.io.ByteArrayInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.net.URL;
import org.cyberneko.html.parsers.DOMParser;
import org.w3c.dom.Document;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
/**
 * Turns HTML obtained from a URL, file, stream, byte array or reader into a
 * W3C DOM {@link Document} by running it through the CyberNeko
 * {@link DOMParser}.
 *
 * @author YeZheng
 */
public class HtmlFileToDocument {

    /**
     * Downloads the HTML behind a URL and parses it into a DOM document.
     * The stream opened on the URL is always closed before returning.
     *
     * @param url
     *            location of the HTML page
     * @param encoding
     *            name of the charset used to decode the bytes
     * @return the parsed document
     * @throws IOException
     *             if the URL cannot be opened or read, or the charset name is
     *             not supported
     * @throws SAXException
     *             if the parser reports an error
     */
    public Document getDocument(URL url, String encoding) throws IOException,
            SAXException {
        InputStream inputStream = url.openStream();
        try {
            return getDocument(inputStream, encoding);
        } finally {
            inputStream.close();
        }
    }

    /**
     * Reads an HTML file and parses it into a DOM document. The file stream
     * is always closed before returning.
     *
     * @param file
     *            the HTML file to read
     * @param encoding
     *            name of the charset used to decode the bytes
     * @return the parsed document
     * @throws SAXException
     *             if the parser reports an error
     * @throws IOException
     *             if the file cannot be opened or read, or the charset name
     *             is not supported
     */
    public Document getDocument(File file, String encoding)
            throws SAXException, IOException {
        FileInputStream inputStream = new FileInputStream(file);
        try {
            return getDocument(inputStream, encoding);
        } finally {
            inputStream.close();
        }
    }

    /**
     * Decodes an HTML byte stream with the given charset and parses it into a
     * DOM document. The stream is NOT closed here; that is left to the
     * caller that opened it.
     *
     * @param inputStream
     *            raw HTML bytes
     * @param encoding
     *            name of the charset used to decode the bytes
     * @return the parsed document
     * @throws SAXException
     *             if the parser reports an error
     * @throws IOException
     *             if reading fails or the charset name is not supported
     */
    public Document getDocument(InputStream inputStream, String encoding)
            throws SAXException, IOException {
        Reader reader = new InputStreamReader(inputStream, encoding);
        return getDocument(reader);
    }

    /**
     * Parses HTML held in memory as a byte array into a DOM document.
     *
     * @param contents
     *            raw HTML bytes
     * @param encoding
     *            name of the charset used to decode the bytes
     * @return the parsed document
     * @throws SAXException
     *             if the parser reports an error
     * @throws IOException
     *             if the charset name is not supported
     */
    public Document getDocument(byte[] contents, String encoding) throws SAXException, IOException {
        // A ByteArrayInputStream holds no system resource and its close() is
        // a no-op, so no try/finally is needed. The previous finally block
        // dereferenced a possibly-null stream and could mask the real error.
        return getDocument(new ByteArrayInputStream(contents), encoding);
    }

    /**
     * Parses HTML supplied as already-decoded characters into a DOM
     * document. The reader is not closed by this method.
     *
     * @param characterStream
     *            source of the HTML text
     * @return the DOM built from the character stream
     * @throws SAXException
     *             if the parser reports an error
     * @throws IOException
     *             if reading from the stream fails
     */
    public Document getDocument(Reader characterStream) throws SAXException,
            IOException {
        DOMParser parser = new DOMParser();
        // parser.setFeature(
        // "http://apache.org/xml/features/scanner/notify-builtin-refs",
        // true);
        InputSource inputSource = new InputSource();
        inputSource.setCharacterStream(characterStream);
        parser.parse(inputSource);
        return parser.getDocument();
    }
}
/**
* Created on 2006-6-28 23:42:54
*/
package org.dutir.util.dom;
import java.io.StringWriter;
import java.util.ArrayList;
import java.util.List;
import java.util.logging.Logger;
import javax.xml.transform.OutputKeys;
import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerException;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.dom.DOMSource;
import javax.xml.transform.stream.StreamResult;
import org.dutir.parser.DomNodeHandle;
import org.dutir.util.LogFormatter;
import org.dutir.util.Pair;
import org.dutir.util.net.HtmlFormSelect;
import org.w3c.dom.Element;
import org.w3c.dom.NamedNodeMap;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import org.w3c.dom.Text;
import com.aliasi.util.Iterators.Array;
/**
 * Static helpers for walking W3C DOM trees, extracting text and links from
 * them, and turning nodes back into markup.
 *
 * @author YeZheng
 */
public class DomNodeUtils {
    private static final Logger log = LogFormatter.getLogger(DomNodeUtils.class);

    /**
     * Returns the next sibling of a node, skipping over any "#text" nodes.
     *
     * @param node
     *            the node whose following sibling is wanted
     * @return the first following sibling whose node name is not "#text", or
     *         {@code null} if there is none
     */
    public static Node getNextSibling(Node node) {
        Node sibling = node.getNextSibling();
        while (sibling != null && sibling.getNodeName().equals("#text")) {
            sibling = sibling.getNextSibling();
        }
        return sibling;
    }

    /**
     * Depth-first traversal driven by a callback. The handler is offered each
     * node; when it returns {@code false} the traversal descends into that
     * node's children, when it returns {@code true} the subtree is skipped.
     *
     * @param node
     *            root of the subtree to visit
     * @param handle
     *            callback invoked for every visited node
     */
    public static void recTravel(Node node, DomNodeHandle handle) {
        if (handle.handle(node)) {
            return;
        }
        NodeList children = node.getChildNodes();
        for (int i = 0, n = children.getLength(); i < n; i++) {
            recTravel(children.item(i), handle);
        }
    }

    /**
     * Appends the value of every text node in the subtree (the node itself
     * and all descendants, in document order) to the buffer.
     *
     * @param sb
     *            buffer receiving the text
     * @param node
     *            root of the subtree to read
     */
    public static void getText(StringBuffer sb, Node node) {
        if (node.getNodeType() == Node.TEXT_NODE) {
            sb.append(node.getNodeValue());
        }
        NodeList children = node.getChildNodes();
        if (children == null) {
            return;
        }
        for (int i = 0, n = children.getLength(); i < n; i++) {
            getText(sb, children.item(i));
        }
    }

    /**
     * Collects the text of the whole subtree rooted at the node.
     *
     * @param node
     *            root of the subtree to read
     * @return concatenation of all text node values in document order
     */
    public static String getText(Node node) {
        StringBuffer buf = new StringBuffer();
        getText(buf, node);
        return buf.toString();
    }

    /**
     * Returns a shallow text value for a node: for container-like nodes
     * (element, attribute, entity, entity reference, document fragment) the
     * value of the first child, for value-carrying nodes (text, CDATA,
     * comment, processing instruction) the node's own value.
     *
     * @param node
     *            the node to read
     * @return the value described above, or {@code null} for document,
     *         doctype and notation nodes, for unknown node types, and for
     *         container nodes without children
     */
    public static String getTextContent(Node node) {
        switch (node.getNodeType()) {
        case Node.ELEMENT_NODE:
        case Node.ATTRIBUTE_NODE:
        case Node.ENTITY_NODE:
        case Node.ENTITY_REFERENCE_NODE:
        case Node.DOCUMENT_FRAGMENT_NODE:
            Node firstChild = node.getFirstChild();
            return firstChild == null ? null : firstChild.getNodeValue();
        case Node.TEXT_NODE:
        case Node.CDATA_SECTION_NODE:
        case Node.COMMENT_NODE:
        case Node.PROCESSING_INSTRUCTION_NODE:
            return node.getNodeValue();
        default:
            return null;
        }
    }

    /**
     * Reads the content of the first {@code title} element of a document.
     *
     * @param domdoc
     *            the document to search
     * @return the shallow text content of the first element returned by
     *         {@code getElementsByTagName("title")}, or {@code null} when no
     *         such element exists
     */
    public static String getTitle(org.w3c.dom.Document domdoc) {
        NodeList titles = domdoc.getElementsByTagName("title");
        if (titles == null || titles.getLength() < 1) {
            return null;
        }
        return getTextContent(titles.item(0));
    }

    /**
     * Hand-rolled serialization of a node as {@code <name attr="value">...</name>}.
     * Attribute values and text are written without any escaping.
     *
     * @param node
     *            the node to serialize
     * @return the markup string
     * @deprecated Use {@link #getXmlAsString(Node)} instead.
     */
    @Deprecated
    public static String toString(Node node) {
        StringBuffer sb = new StringBuffer();
        if (node instanceof Text) {
            if (node.getNodeValue() != null) {
                sb.append(replaceQuestionMarkBytes(node.getNodeValue()));
            }
            return sb.toString();
        }
        sb.append("<").append(node.getNodeName());
        // getAttributes() is null for every node type except elements.
        NamedNodeMap attrs = node.getAttributes();
        if (attrs != null) {
            for (int j = 0; j < attrs.getLength(); j++) {
                Node attr = attrs.item(j);
                sb.append(" ").append(attr.getNodeName())
                        .append("=\"").append(attr.getNodeValue())
                        .append("\"");
            }
        }
        sb.append(">");
        if (node.hasChildNodes()) {
            NodeList children = node.getChildNodes();
            for (int i = 0; i < children.getLength(); i++) {
                sb.append(toString(children.item(i)));
            }
        } else if (node.getNodeValue() != null) {
            // Elements have a null value; appending it used to put the
            // literal text "null" inside every empty element.
            sb.append(node.getNodeValue());
        }
        sb.append("</").append(node.getNodeName()).append(">");
        return sb.toString();
    }

    /**
     * Round-trips the text through the platform default charset and replaces
     * every byte with value 63 (ASCII '?') by the bytes of a single space.
     * NOTE(review): presumably meant to blank out characters the platform
     * charset cannot encode - confirm; literal question marks are replaced
     * as well.
     */
    private static String replaceQuestionMarkBytes(String value) {
        byte[] bytes = value.getBytes();
        byte[] space = " ".getBytes();
        List<Byte> replaced = new ArrayList<Byte>();
        for (int i = 0; i < bytes.length; i++) {
            if (bytes[i] == 63) {
                for (byte b : space) {
                    replaced.add(b);
                }
            } else {
                replaced.add(bytes[i]);
            }
        }
        byte[] result = new byte[replaced.size()];
        for (int i = 0; i < result.length; i++) {
            result[i] = replaced.get(i).byteValue();
        }
        return new String(result);
    }

    /**
     * Finds the first node, in depth-first document order and starting with
     * the given node itself, whose name equals the tag ignoring case.
     *
     * @param node
     *            root of the subtree to search
     * @param tag
     *            node name to look for (case-insensitive)
     * @return the matching node, or {@code null} if the subtree contains none
     */
    public static Node getChildNodeByName(Node node, String tag) {
        if (tag.equalsIgnoreCase(node.getNodeName())) {
            return node;
        }
        NodeList children = node.getChildNodes();
        for (int i = 0, n = children.getLength(); i < n; i++) {
            // The recursive result used to be thrown away, so descendants
            // were never actually returned.
            Node found = getChildNodeByName(children.item(i), tag);
            if (found != null) {
                return found;
            }
        }
        return null;
    }

    /**
     * Collects all hyperlinks of a subtree.
     *
     * @param node
     *            root of the subtree to search
     * @return one (anchor text, href) pair per {@code a} node that has an
     *         {@code href} attribute, in document order
     */
    public static ArrayList<Pair> getLinks(Node node) {
        ArrayList<Pair> retlist = new ArrayList<Pair>();
        getLinks(node, retlist);
        return retlist;
    }

    /**
     * Appends the hyperlinks of a subtree to a list. A node named "a"
     * (ignoring case) contributes a pair of its full text and its
     * {@code href} value and is not searched further; any other node is
     * searched recursively. A {@code null} node is ignored.
     *
     * @param node
     *            root of the subtree to search, may be {@code null}
     * @param list
     *            list receiving the (anchor text, href) pairs
     */
    public static void getLinks(Node node, ArrayList<Pair> list) {
        if (node == null) {
            return;
        }
        if (!node.getNodeName().equalsIgnoreCase("a")) {
            NodeList children = node.getChildNodes();
            for (int i = 0, n = children.getLength(); i < n; i++) {
                getLinks(children.item(i), list);
            }
            return;
        }
        NamedNodeMap attributes = node.getAttributes();
        if (attributes == null) {
            return;
        }
        Node href = attributes.getNamedItem("href");
        if (href != null) {
            String link = href.getNodeValue();
            String anchor = DomNodeUtils.getText(node);
            list.add(new Pair<String, String>(anchor, link));
        }
    }

    /**
     * Creates a transformer configured for indented, standalone output
     * without an XML declaration.
     */
    private static Transformer getTransformer() throws TransformerException {
        Transformer transformer = TransformerFactory.newInstance().newTransformer();
        transformer.setOutputProperty(OutputKeys.INDENT, "yes");
        transformer.setOutputProperty(OutputKeys.STANDALONE, "yes");
        transformer.setOutputProperty(OutputKeys.OMIT_XML_DECLARATION, "yes");
        return transformer;
    }

    /**
     * Serializes a node with the JAXP identity transformer.
     *
     * @param node
     *            the node to serialize
     * @return output of the node's xml string.
     * @throws TransformerException
     *             if the transformer cannot be created or the transformation
     *             fails
     */
    public static String getXmlAsString(Node node) throws TransformerException {
        StringWriter xmlString = new StringWriter();
        getTransformer().transform(new DOMSource(node), new StreamResult(xmlString));
        return xmlString.toString();
    }

    /**
     * Returns the descendant elements (children, grandchildren, ...) of a
     * node whose tag name matches, in document order. The node itself is
     * never part of the result. Names are compared ignoring case, like the
     * other lookups in this class, and {@code "*"} matches every element.
     *
     * @param node
     *            root of the subtree to search
     * @param tagName
     *            element name to look for, or {@code "*"}
     * @return a snapshot list of the matching elements; empty (never
     *         {@code null}) when nothing matches or an argument is
     *         {@code null}
     */
    public static NodeList getElementsByTagName(Node node, String tagName) {
        List<Node> matches = new ArrayList<Node>();
        if (node != null && tagName != null) {
            collectDescendantElements(node, tagName, matches);
        }
        return new ListBackedNodeList(matches);
    }

    /** Pre-order walk that adds every matching descendant element to the list. */
    private static void collectDescendantElements(Node parent, String tagName,
            List<Node> matches) {
        NodeList children = parent.getChildNodes();
        for (int i = 0, n = children.getLength(); i < n; i++) {
            Node child = children.item(i);
            if (child.getNodeType() == Node.ELEMENT_NODE
                    && ("*".equals(tagName) || tagName.equalsIgnoreCase(child.getNodeName()))) {
                matches.add(child);
            }
            collectDescendantElements(child, tagName, matches);
        }
    }

    /** Immutable {@link NodeList} view over a list of nodes. */
    private static final class ListBackedNodeList implements NodeList {
        private final List<Node> nodes;

        ListBackedNodeList(List<Node> nodes) {
            this.nodes = nodes;
        }

        public Node item(int index) {
            // The DOM contract asks for null, not an exception, when out of range.
            return (index >= 0 && index < nodes.size()) ? nodes.get(index) : null;
        }

        public int getLength() {
            return nodes.size();
        }
    }

    /**
     * Parses a <code>select</code> html element into an
     * {@link HtmlFormSelect}: its name, every option as a candidate
     * value/label, and the options carrying a <code>selected</code>
     * attribute as the current values/labels. When no option is marked
     * selected, the first option (if any) is taken as the selection.
     *
     * @param select
     *            a <code>select</code> html element.
     * @return the html form select.
     */
    public static HtmlFormSelect getSelect(Element select) {
        HtmlFormSelect hfs = new HtmlFormSelect();
        hfs.setName(select.getAttribute("name"));
        NodeList options = select.getElementsByTagName("option");
        if (options.getLength() == 0) {
            // html and xhtml differ in case sensitivity, so retry with the
            // upper-case tag name.
            options = select.getElementsByTagName("OPTION");
        }
        int count = options.getLength();
        List<String> candidateValues = new ArrayList<String>(count);
        List<String> candidateLabels = new ArrayList<String>(count);
        List<String> values = new ArrayList<String>(count);
        List<String> labels = new ArrayList<String>(count);
        for (int i = 0; i < count; i++) {
            Element option = (Element) options.item(i);
            String value = option.getAttribute("value");
            String label = optionLabel(option);
            candidateValues.add(value);
            candidateLabels.add(label);
            // This bookkeeping had been commented out, so the selection
            // always fell back to the first option.
            if (option.hasAttribute("selected")) {
                values.add(value);
                labels.add(label);
            }
        }
        // Nothing is marked selected: default to the first option.
        if (values.size() == 0 && count != 0) {
            Element first = (Element) options.item(0);
            values.add(first.getAttribute("value"));
            labels.add(optionLabel(first));
        }
        hfs.setCandidateValues(candidateValues.toArray(new String[candidateValues.size()]));
        hfs.setCandidateLabels(candidateLabels.toArray(new String[candidateLabels.size()]));
        hfs.setValues(values.toArray(new String[values.size()]));
        hfs.setLabels(labels.toArray(new String[labels.size()]));
        return hfs;
    }

    /**
     * Value of the option's first child, or the empty string for an option
     * without children (which used to cause a NullPointerException).
     */
    private static String optionLabel(Element option) {
        Node first = option.getFirstChild();
        return first == null ? "" : first.getNodeValue();
    }
}
Java解析XML文档(简单实例)——dom解析xml
一、前言
用Java解析XML文档,最常用的有两种方法:基于事件的XML简单API(Simple API for XML),称为SAX;以及基于树和节点的文档对象模型(Document Object Model),称为DOM。Sun公司提供了Java API for XML Parsing(JAXP)接口来使用SAX和DOM,通过JAXP,我们可以使用任何与JAXP兼容的XML解析器。
JAXP接口包含了三个包:
(1)org.w3c.dom W3C推荐的用于XML标准规划文档对象模型的接口。
(2)org.xml.sax 用于对XML进行语法分析的事件驱动的XML简单API(SAX)
(3)javax.xml.parsers 解析器工厂工具,程序员用它来获得并配置特定的语法分析器。
二、前提
DOM编程不需要其它的依赖包,因为JDK自带的org.w3c.dom、org.xml.sax和javax.xml.parsers包就可以满足条件了。
三、使用DOM解析XML文档
我们现在来看看DOM是如何解析XML的吧!同样的,我将从一个简单的不能再简单的例子来说明DOM是如何解析XML文档的,先让我们看看XML是什么内容吧:
<?xml version="1.0" encoding="gb2312"?>
<books>
<book email="zhoujunhui">
<name>rjzjh</name>
<price>jjjjjj</price>
</book>
</books>
简单的不能再简单了。但是该有的都有了,根元素、属性、子节点。好了,能反应问题就行了,下面来看看解析这个XML文件的Java代码吧!
/**
 * Minimal DOM parsing example: reads {@code bin/library.xml} and prints, for
 * every element directly under the root, its {@code email} attribute followed
 * by the values found in its {@code name} and {@code price} child elements.
 */
public class DomParse {

    /**
     * Runs the whole example. Parser, file and I/O errors are caught and
     * their stack traces printed.
     */
    public DomParse() {
        DocumentBuilderFactory domfac = DocumentBuilderFactory.newInstance();
        try {
            DocumentBuilder dombuilder = domfac.newDocumentBuilder();
            InputStream is = new FileInputStream("bin/library.xml");
            try {
                Document doc = dombuilder.parse(is);

                Element root = doc.getDocumentElement();
                NodeList books = root.getChildNodes();
                if (books != null) {
                    for (int i = 0; i < books.getLength(); i++) {
                        Node book = books.item(i);
                        // Children also include whitespace text nodes; only
                        // elements are of interest here.
                        if (book.getNodeType() == Node.ELEMENT_NODE) {
                            String email = book.getAttributes().getNamedItem("email").getNodeValue();
                            System.out.println(email);
                            for (Node node = book.getFirstChild(); node != null; node = node.getNextSibling()) {
                                if (node.getNodeType() == Node.ELEMENT_NODE) {
                                    if (node.getNodeName().equals("name")) {
                                        // An element's own value is null; its
                                        // text lives in the child text node.
                                        String name = node.getNodeValue();
                                        String name1 = node.getFirstChild().getNodeValue();
                                        System.out.println(name);
                                        System.out.println(name1);
                                    }
                                    if (node.getNodeName().equals("price")) {
                                        String price = node.getFirstChild().getNodeValue();
                                        System.out.println(price);
                                    }
                                }
                            }
                        }
                    }
                }
            } finally {
                // The stream used to be left open; release the file handle
                // whether or not parsing succeeded.
                is.close();
            }
        } catch (ParserConfigurationException e) {
            e.printStackTrace();
        } catch (FileNotFoundException e) {
            e.printStackTrace();
        } catch (SAXException e) {
            e.printStackTrace();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /** Entry point: runs the example once. */
    public static void main(String[] args) {
        new DomParse();
    }
}
四、代码解释
先看看这个程序引用类:
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;
//下面主要是org.w3c.dom包的类
import org.w3c.dom.Document;
import org.w3c.dom.Element;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import org.xml.sax.SAXException;
上面那么简单的代码一看就明白了,但是为了介绍个DOM编程的大概还是来看看这个程序吧:
(1)得到DOM解析器的工厂实例
DocumentBuilderFactory domfac=DocumentBuilderFactory.newInstance();
得到javax.xml.parsers.DocumentBuilderFactory;类的实例就是我们要的解析器工厂
(2)从DOM工厂获得DOM解析器
DocumentBuilder dombuilder=domfac.newDocumentBuilder();
通过javax.xml.parsers.DocumentBuilderFactory实例的newDocumentBuilder()方法(这是实例方法,不是静态方法)得到DOM解析器
(3)把要解析的XML文档转化为输入流,以便DOM解析器解析它
InputStream is=new FileInputStream("bin/library.xml");
InputStream是一个抽象类,这里用它的子类FileInputStream来读取文件。
(4)解析XML文档的输入流,得到一个Document
Document doc=dombuilder.parse(is);
由XML文档的输入流得到一个org.w3c.dom.Document对象,以后的处理都是对Document对象进行的
(5)得到XML文档的根节点
Element root=doc.getDocumentElement();
getDocumentElement()返回的根节点是一个org.w3c.dom.Element对象(文档中其它的元素节点同样是Element对象)。
(6)得到节点的子节点
NodeList books=root.getChildNodes();
for(int i=0;i<books.getLength();i++){
Node book=books.item(i);
}
这是用一个org.w3c.dom.NodeList接口来存放它所有子节点的,还有一种轮循子节点的方法,后面有介绍
(7)取得节点的属性值
String email=book.getAttributes().getNamedItem("email").getNodeValue();
System.out.println(email);
注意,节点的属性并不是它的子节点,不会出现在getChildNodes()的结果中,需要通过getAttributes()来获取;属性节点的类型是Node.ATTRIBUTE_NODE,而不是Node.ELEMENT_NODE。
(8)轮循子节点
for(Node node=book.getFirstChild();node!=null;node=node.getNextSibling()){
if(node.getNodeType()==Node.ELEMENT_NODE){
if(node.getNodeName().equals("name")){
String name=node.getNodeValue();
String name1=node.getFirstChild().getNodeValue();
System.out.println(name);
System.out.println(name1);
}
if(node.getNodeName().equals("price")){
String price=node.getFirstChild().getNodeValue();
System.out.println(price);
}
}
这段代码的打印输出为:
null
rjzjh
jjjjjj
从上面可以看出
String name=node.getNodeValue(); 是一个空值。而
String name1=node.getFirstChild().getNodeValue(); 才是真正的值,这是因为DOM把<name>rjzjh</name>也当作是两层结构的节点,其父节点是<name>,子节点rjzjh才是我们真正想得到的。