2008-04-23

依存树解析的JAVA接口


摘要:

本文以代码的形式,将调用依存树解析的JAVA接口呈现给读者。有了这些代码和哈工大的共享资源库,就可以编译生成完整的、可供JAVA调用的DLL文件。本文不讲解基本的JNI技术,对JNI不熟悉的读者请参考相关文章。

本文JNI cpp代码的两大看点:

1、解决中文在使用JNI技术出现乱码的问题。

2、在C++端直接生成JAVA的ArrayList对象。

正文:

近日使用哈工大的句法分析模块(依存树解析)做一些小实验。为了方便JAVA调用,利用JNI技术写了一个java调用接口。其他如分词等等的做法也可利用本文做参考。

注:

VC工程 需要哈工大的程序库相关DLL,头文件(__ltp_dll.h)和CPP文件(__ltp_dll_x.cpp)的支持。

使用的哈工大程序库版本为1.3.2
XML函数库使用的是jdom(版本号未知,需要的读者可以给我发EMAIL索取)。

1、
JAVA类:DependencyTreeNode,表示依存树上的一个节点


package hit_irlab.jni;
/**
 * A single node (word) of a dependency tree.
 *
 * <p>The fields are public and keep their exact names and types because the
 * native side of this project looks them up by name through JNI
 * {@code GetFieldID} and assigns them directly.
 */
public class DependencyTreeNode {
    /** Id of the word within the document, starting from 0. */
    public int idInDocument = 0;

    /** Id of the word within its paragraph, starting from 0. */
    public int idInParagraph = 0;

    /** Id of the word within its sentence, starting from 0. */
    public int idInSentence = 0;

    /** Text of the word. */
    public String content = null;

    /** Part-of-speech tag of the word. */
    public String pos = null;

    /**
     * {@code idInSentence} of the node that points at this word.
     * {@code -2} means no word points at this word; {@code -1} means the
     * end-of-sentence marker (EOS).
     */
    public int parent = -2;

    /** Dependency relation between this word and its parent. */
    public String relate = null;

    /**
     * Renders the node as {@code content/pos,parent=N,relate=R}.
     * Unset string fields are printed as {@code null}.
     */
    @Override
    public String toString() {
        return content + "/" + pos + ",parent=" + parent + ",relate=" + relate;
    }
}

2、
JAVA类:DependencyTree:用于访问依存树解析的JAVA类


package hit_irlab.jni;
import
java.io.*;
import java.util.*;
import org.jdom.Document;
import
org.jdom.Element;
import org.jdom.output.Format;
import
org.jdom.output.XMLOutputter;
// 表示一棵依存树
public class DependencyTree
{
//表示整个文档的依存树节点
// elements:
ArrayList<>(段落构成的ArrayList)
// 段落 由 句子的 ArrayList 构成
//
每一个句子 ArrayList 由 DependencyTreeNode 组成
//
ArrayList<ArrayList<ArrayList<DependencyTreeNode>>>

private ArrayList documentTreeNodeList = null;
public DependencyTree(File
txtFile) {
documentTreeNodeList =
getDocumentTreeNodesFromTxtFile(txtFile.
getAbsolutePath());

}
public DependencyTree(String str) {
documentTreeNodeList =
getDocumentTreeNodesFromString(str);
}
// 取得段落列表
public
ArrayList getParagraphList() {
return documentTreeNodeList;

}
// 取得句子列表
public ArrayList getSentenceList() {

ArrayList ret = new ArrayList();
for (int i = 0; i <
documentTreeNodeList.size(); i++) {
// 按照段落循环

ArrayList paragraphList = (ArrayList) documentTreeNodeList.get(i);

ret.addAll(paragraphList);
}
return ret;
}

// 取得所有节点列表
public ArrayList getTreeNodeList() {
ArrayList
ret = new ArrayList();
for (int i = 0; i <
documentTreeNodeList.size(); i++) {
// 按照段落循环

ArrayList paragraphList = (ArrayList) documentTreeNodeList.get(i);

for (int j = 0; j < paragraphList.size(); j++) {

ArrayList sentenceList = (ArrayList) paragraphList.get(j);

ret.addAll(sentenceList);
}
}
return
ret;
}
// 利用本地方法,获得一个 文档的所有 依存树节点列表
private native
ArrayList getDocumentTreeNodesFromTxtFile(String filePath);
//
利用本地方法,获得一个 文档的所有 依存树节点列表
private native ArrayList
getDocumentTreeNodesFromString(String str);
static {

System.loadLibrary("dependencyTreeJni");
}

/**

* 将得到的解析树,保存为 xml 文件
* @param xmlFile
* @throws IOException

*/
public void saveToXml(File xmlFile) throws
IOException{
Element newroot = new Element("xml4nlp");
Document
newdocument = new Document (newroot);

Element noteElement = new
Element("note");
noteElement.setAttribute("sent","y");
noteElement.setAttribute("word","y");
noteElement.setAttribute("pos","y");
noteElement.setAttribute("ne","n");
noteElement.setAttribute("parser","y");
noteElement.setAttribute("wsd","n");
noteElement.setAttribute("srl","n");
noteElement.setAttribute("class","n");
noteElement.setAttribute("sum","n");
noteElement.setAttribute("cr","n");

newroot.addContent(
noteElement );// 添加Note

Element newdoc = new
Element("doc");
newroot.addContent(newdoc);

// System.out.println(documentTreeNodeList);

//
按照段落进行循环
for (int i = 0; i < documentTreeNodeList.size(); i++) {

ArrayList paragraphList = (ArrayList) documentTreeNodeList.get(i);


Element paraElement = new Element("para");

paraElement.setAttribute("id",""+i);

newdoc.addContent(paraElement);

///
按照句子进行循环
for (int j = 0; j < paragraphList.size(); j++) {

ArrayList sentenceList = (ArrayList) paragraphList.get(j);


Element sentElement = new Element("sent");

sentElement.setAttribute("id",""+j);

paraElement.addContent(sentElement);


String sentC;
for (int k = 0; k < sentenceList.size();
k++) {
DependencyTreeNode node = (DependencyTreeNode)
sentenceList.get(k);

Element word
= new Element("word");
word.setAttribute("id",""+k);

word.setAttribute("cont",node.content);

word.setAttribute("pos",node.pos);

word.setAttribute("parent",""+node.parent);

word.setAttribute("relate",node.relate);

sentElement.addContent(word);


sentContent += node.content;
}


sentElement.setAttribute("cont",sentContent);


}
}

//输出这个xml文件
// System.out.println(""+
xmlFile.getAbsolutePath());
tool.XmlTool.OutputDocToFile(newdocument,
"nlp_style.xsl","gb2312", xmlFile.getAbsolutePath());

}

/**
* 演示使用方法
* @param args String[]
* @throws
IOException
*/
public static void usage() throws IOException
{
// 可以使用文本文件,或者字符串构建 依存树
DependencyTree tree = new
DependencyTree("The board forced him to resign.");// 使用字符串
//
DependencyTree tree = new DependencyTree(new
File("test.txt"));//使用文本文件
// ArrayList nodeList =
tree.getTreeNodeList();
// for (int i = 0; i < nodeList.size();
i++) {
// DependencyTreeNode node = (DependencyTreeNode)
nodeList.get(i);
// System.out.print("字符串内容:"+ node.content + "\t"
);
// System.out.print("词性标注:"+node.pos + "\t");
//
System.out.print("该节点在该句子中的id:" + node.idInSentence + "\t");
//
System.out.print("指向该节点的节点在句子中的id:" + node.parent + "\t");
//
System.out.println("两个节点之间的关系:"+node.relate);
// }


tree.saveToXml(new File("test_parse_en.xml"));
}
/**

* @param args
* @throws IOException
*/
public static
void main(String[] args) throws IOException {
long start =
System.currentTimeMillis();
usage();
long end =
System.currentTimeMillis();
System.out.println("use
time:"+(end-start)+" :ms");
// DependencyTree tree = new DependencyTree(new
File("test.txt"));
// DependencyTree tree = new
DependencyTree("我的名字是胡宝顺");
//
System.out.println(tree.getTreeNodeList());
//
//
System.out.println(tree.getSentenceList());
//
System.out.println(tree.getParagraphList());
}
}

3、
JAVA类:XmlTool:用于输出XML文件的类


package tool;
import java.io.FileOutputStream;
import java.io.FileWriter;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.io.Writer;

import org.jdom.Document;
import org.jdom.output.Format;
import org.jdom.output.XMLOutputter;
/** Helper for writing a JDOM document to an XML file. */
public class XmlTool {

    /**
     * Writes the root element of {@code doc} to {@code filePath}, preceded by
     * a hand-written XML declaration and, optionally, an
     * {@code xml-stylesheet} processing instruction.
     *
     * <p>When {@code encoding} is usable (non-null, longer than one
     * character) it is both declared in the XML header and used to encode the
     * bytes written, so the declaration always matches the file content.
     * Otherwise no encoding is declared and the platform default charset is
     * used.
     *
     * @param doc      document whose root element is written
     * @param xslpath  stylesheet href; ignored when null or not longer than
     *                 three characters
     * @param encoding charset name, e.g. {@code "gb2312"}
     * @param filePath destination file path
     * @throws IOException if the file cannot be opened or written, or the
     *                     encoding is not supported
     */
    public static void OutputDocToFile(Document doc, String xslpath,
            String encoding, String filePath) throws IOException {
        Format format = Format.getPrettyFormat();
        format.setEncoding(encoding);
        XMLOutputter outputter = new XMLOutputter(format);

        boolean declareEncoding = encoding != null && encoding.length() > 1;

        FileOutputStream stream = new FileOutputStream(filePath);
        try {
            Writer out = declareEncoding
                    ? new OutputStreamWriter(stream, encoding)
                    : new OutputStreamWriter(stream);

            // XML declaration, e.g. <?xml version="1.0" encoding="gb2312"?>
            out.write("<?xml version=\"1.0\"");
            if (declareEncoding) {
                out.write(" encoding=\"" + encoding + "\"");
            }
            out.write("?>");
            out.write("\r\n");

            if (xslpath != null && xslpath.length() > 3) {
                out.write("<?xml-stylesheet type=\"text/xsl\" href=\"" + xslpath + "\"?>");
                out.write("\r\n");
            }

            outputter.output(doc.getRootElement(), out);
            out.flush();
        } finally {
            // Closed on every path so the file handle is not leaked on error.
            stream.close();
        }
    }
}

4、
JNI的C/C++头文件:hit_irlab_jni_DependencyTree.h


/* DO NOT EDIT THIS FILE - it is machine generated */
#include <jni.h>
/* Header for class hit_irlab_jni_DependencyTree */

#ifndef _Included_hit_irlab_jni_DependencyTree
#define _Included_hit_irlab_jni_DependencyTree
#ifdef __cplusplus
extern "C" {
#endif
/*
 * Class:     hit_irlab_jni_DependencyTree
 * Method:    getDocumentTreeNodesFromTxtFile
 * Signature: (Ljava/lang/String;)Ljava/util/ArrayList;
 */
JNIEXPORT jobject JNICALL Java_hit_1irlab_jni_DependencyTree_getDocumentTreeNodesFromTxtFile
  (JNIEnv *, jobject, jstring);

/*
 * Class:     hit_irlab_jni_DependencyTree
 * Method:    getDocumentTreeNodesFromString
 * Signature: (Ljava/lang/String;)Ljava/util/ArrayList;
 */
JNIEXPORT jobject JNICALL Java_hit_1irlab_jni_DependencyTree_getDocumentTreeNodesFromString
  (JNIEnv *, jobject, jstring);

#ifdef __cplusplus
}
#endif
#endif

5、 实现JNI的DLL的核心CPP文件:dependencyTree.cpp


// LTP library interface and its import library.
#include "__ltp_dll.h"
#pragma comment(lib, "__ltp_dll.lib")
#pragma warning(disable : 4786)

// JNI declarations generated from hit_irlab.jni.DependencyTree.
#include "hit_irlab_jni_DependencyTree.h"

#include <windows.h>

// C runtime headers for malloc/free, strlen and printf used below.
#include <cstdio>
#include <cstdlib>
#include <cstring>

#include <algorithm>
#include <iterator>
#include <vector>
#include <string>
#include <iostream>
#include <utility>
#include <map>
#include <fstream>
#include <conio.h>
#include <ctime>

using namespace std;
using namespace HIT_IR_LTP; // Important!

//
字符转换函数,解决中文字符乱码问题
jstring WindowsTojstring( JNIEnv* env, const char* str
)
{
jstring rtn = 0;
int slen = strlen(str);
unsigned short *
buffer = 0;
if( slen == 0 )
rtn = (env)->NewStringUTF(str );

else
{
int length = MultiByteToWideChar( CP_ACP, 0,
(LPCSTR)str, slen, NULL, 0 );
buffer = (unsigned short *)malloc( length*2
+ 1 );
if( MultiByteToWideChar( CP_ACP, 0, (LPCSTR)str, slen,
(LPWSTR)buffer, length ) >0 )
rtn =
(env)->NewString( (jchar*)buffer, length );
}
if( buffer
)
free( buffer );
return rtn;
}
// Converts a Java string to a NUL-terminated string in the Windows ANSI code
// page (CP_ACP).
//
// The result is allocated with malloc; the caller owns it and must release it
// with free(). An empty Java string yields an empty C string. Returns NULL
// when jstr is NULL, or when allocation or conversion fails.
char* jstringToWindows( JNIEnv *env, jstring jstr )
{
    if( jstr == NULL )
        return NULL;

    int length = env->GetStringLength( jstr );
    const jchar* jcstr = env->GetStringChars( jstr, 0 );
    if( jcstr == NULL )
        return NULL;

    char* rtn = NULL;
    if( length == 0 )
    {
        // WideCharToMultiByte rejects a zero-length input, so handle it here.
        rtn = (char*)malloc( 1 );
        if( rtn != NULL )
            rtn[0] = 0;
    }
    else
    {
        // Ask for the exact byte count instead of guessing 2 bytes per char.
        int size = WideCharToMultiByte( CP_ACP, 0, (LPCWSTR)jcstr, length,
                                        NULL, 0, NULL, NULL );
        if( size > 0 )
        {
            rtn = (char*)malloc( size + 1 );
            if( rtn != NULL )
            {
                size = WideCharToMultiByte( CP_ACP, 0, (LPCWSTR)jcstr, length,
                                            rtn, size, NULL, NULL );
                if( size > 0 )
                {
                    rtn[size] = 0;
                }
                else
                {
                    free( rtn );
                    rtn = NULL;
                }
            }
        }
    }

    // Released on every path, including failures.
    env->ReleaseStringChars( jstr, jcstr );
    return rtn;
}
// Forward declaration: builds one dependency tree node object.
jobject constructDependencyTreeNode( JNIEnv *env,
                                     int idInDocument,
                                     int idInParagraph,
                                     int idInSentence,
                                     const char *content,
                                     const char *pos,
                                     int parent,
                                     const char *relate );

// Forward declaration: worker called by the JNI entry points below.
jobject getDocumentTreeNodes( JNIEnv *env, jobject obj, jstring instr, int sourceType );
/*
 * JNI entry point.
 * Class:     hit_irlab_jni_DependencyTree
 * Method:    getDocumentTreeNodesFromTxtFile
 * Signature: (Ljava/lang/String;)Ljava/util/ArrayList;
 *
 * Forwards to getDocumentTreeNodes with sourceType 1.
 */
JNIEXPORT jobject JNICALL
Java_hit_1irlab_jni_DependencyTree_getDocumentTreeNodesFromTxtFile
  ( JNIEnv *env, jobject obj, jstring txtFileName )
{
    const int sourceIsTxtFile = 1;
    jobject documentList = getDocumentTreeNodes( env, obj, txtFileName, sourceIsTxtFile );
    return documentList;
}
/*
 * Core routine that builds the dependency tree.
 *
 * sourceType: 1 = instr is the path of a text file;
 *             2 = instr is the text to analyse.
 *
 * Returns a java.util.ArrayList of paragraphs; each paragraph is an ArrayList
 * of sentences; each sentence is an ArrayList of DependencyTreeNode objects.
 * Only words for which GetParse succeeds with a non-NULL relation are added.
 * idInDocument and idInParagraph count those added words starting from 0;
 * idInSentence is the word's index in the sentence.
 *
 * Returns NULL when sourceType is invalid, when instr cannot be converted,
 * or when a Java exception is pending before the document list is created.
 */
jobject getDocumentTreeNodes( JNIEnv *env, jobject obj, jstring instr, int sourceType )
{
    if( sourceType != 1 && sourceType != 2 )
    {
        printf("wrong sourceType!\n");
        return NULL;
    }

    // jstringToWindows returns a malloc'ed buffer: it must be released with
    // free(), not ReleaseStringUTFChars.
    char *c_instr = jstringToWindows( env, instr );
    if( c_instr == NULL )
        return NULL;

    if( sourceType == 1 )
        CreateDOMFromTxt( c_instr );
    else
        CreateDOMFromString( c_instr );
    Parser();

    jobject documentArrayList = NULL;

    jclass class_ArrayList = env->FindClass("java/util/ArrayList");
    jmethodID construct = NULL;
    jmethodID list_add = NULL;
    if( class_ArrayList != NULL )
        construct = env->GetMethodID( class_ArrayList, "<init>", "()V" );
    if( construct != NULL )
        list_add = env->GetMethodID( class_ArrayList, "add", "(Ljava/lang/Object;)Z" );
    if( list_add != NULL )
        documentArrayList = env->NewObject( class_ArrayList, construct );

    if( documentArrayList != NULL )
    {
        int idInDocument = 0;
        int paraNum = CountParagraphInDocument();

        // Every loop stops as soon as a Java exception is pending.
        for( int k = 0; k < paraNum && !env->ExceptionCheck(); ++k )
        {
            jobject paragraphArrayList = env->NewObject( class_ArrayList, construct );
            if( paragraphArrayList == NULL )
                break;
            // List.add returns boolean, so it goes through CallBooleanMethod.
            env->CallBooleanMethod( documentArrayList, list_add, paragraphArrayList );

            int idInParagraph = 0;
            int sentNum = CountSentenceInParagraph( k );
            for( int j = 0; j < sentNum && !env->ExceptionCheck(); ++j )
            {
                jobject sentenceArrayList = env->NewObject( class_ArrayList, construct );
                if( sentenceArrayList == NULL )
                    break;
                env->CallBooleanMethod( paragraphArrayList, list_add, sentenceArrayList );

                int wordNum = CountWordInSentence( k, j );
                for( int i = 0; i < wordNum && !env->ExceptionCheck(); ++i )
                {
                    pair<int, const char *> parent_relate;
                    int ret = GetParse( parent_relate, k, j, i );
                    if( ret != 0 || parent_relate.second == NULL )
                        continue;

                    jobject oneNode = constructDependencyTreeNode(
                        env, idInDocument, idInParagraph, i,
                        GetWord( k, j, i ), GetPOS( k, j, i ),
                        parent_relate.first, parent_relate.second );
                    // Ids are handed out first, then advanced: they start at 0.
                    ++idInDocument;
                    ++idInParagraph;
                    if( oneNode == NULL )
                        break;

                    env->CallBooleanMethod( sentenceArrayList, list_add, oneNode );
                    // The list now holds the node; drop the local reference so
                    // large documents do not exhaust the local reference table.
                    env->DeleteLocalRef( oneNode );
                }
                env->DeleteLocalRef( sentenceArrayList );
            }
            env->DeleteLocalRef( paragraphArrayList );
        }
    }

    if( class_ArrayList != NULL )
        env->DeleteLocalRef( class_ArrayList );

    // Freed only after all DOM queries are done.
    // NOTE(review): presumably the LTP library copies the input text; freeing
    // this late is safe either way.
    free( c_instr );
    return documentArrayList;
}

/*
 * JNI entry point.
 * Class:     hit_irlab_jni_DependencyTree
 * Method:    getDocumentTreeNodesFromString
 * Signature: (Ljava/lang/String;)Ljava/util/ArrayList;
 *
 * Forwards to getDocumentTreeNodes with sourceType 2.
 */
JNIEXPORT jobject JNICALL
Java_hit_1irlab_jni_DependencyTree_getDocumentTreeNodesFromString
  ( JNIEnv *env, jobject obj, jstring instr )
{
    const int sourceIsString = 2;
    jobject documentList = getDocumentTreeNodes( env, obj, instr, sourceIsString );
    return documentList;
}

/*
*
构造treeNode对象
*/
jobject constructDependencyTreeNode(JNIEnv *env,int
idInDocument,int idInParagraph,
int idInSentence,const char
*content,const char *pos,int parent,const char *relate
){

/**************创建DependencyTreeNode对象
start*****************/
jclass
class_treeNode=env->FindClass("hit_irlab/jni/DependencyTreeNode");/* 获得Java类
*/
jmethodID construct_treeNode=env->GetMethodID( class_treeNode,
"<init>","()V");/* 获得构造方法 */
jobject obj_treeNode =env->NewObject(
class_treeNode, construct_treeNode, "");/* 创建java对象
*/
/**************创建属性ID***************************/
jfieldID jcontent =
env->GetFieldID(class_treeNode,"content","Ljava/lang/String;");
jfieldID
jpos
= env->GetFieldID(class_treeNode,"pos","Ljava/lang/String;");
jfieldID
jrelate =
env->GetFieldID(class_treeNode,"relate","Ljava/lang/String;");
jfieldID
jidInDocument =
env->GetFieldID(class_treeNode,"idInDocument","I");
jfieldID
jidInParagraph =
env->GetFieldID(class_treeNode,"idInParagraph","I");
jfieldID
jidInSentence =
env->GetFieldID(class_treeNode,"idInSentence","I");
jfieldID jparent =
env->GetFieldID(class_treeNode,"parent","I");

/**************给对象的属性赋值*************************/
env->SetIntField(obj_treeNode,jidInDocument,
idInDocument);
env->SetIntField(obj_treeNode,jidInParagraph,
idInParagraph);
env->SetIntField(obj_treeNode,jidInSentence,
idInSentence);
env->SetIntField(obj_treeNode,jparent,
parent);

env->SetObjectField(obj_treeNode,jcontent, WindowsTojstring(
env, content ) );
env->SetObjectField(obj_treeNode,jpos,
env->NewStringUTF(pos) );
env->SetObjectField(obj_treeNode,jrelate,
env->NewStringUTF(relate) );
return obj_treeNode;
}

0 评论:

Post a Comment

Popular Posts