import java.io.Reader;
import java.io.StringReader;
import org.apache.lucene.analysis.*;
import org.apache.lucene.analysis.cjk.CJKAnalyzer;
import org.apache.lucene.analysis.cn.ChineseAnalyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.mira.lucene.analysis.MIK_CAnalyzer;
public?class?JeAnalyzer?{
public?static?void?testStandard(String?testString)?{
try?{
Analyzer?analyzer?=?new?StandardAnalyzer();
Reader?r?=?new?StringReader(testString);
StopFilter?sf?=?(StopFilter)?analyzer.tokenStream("",?r);
System.err.println("=====standard?analyzer====");
Token?t;
while?((t?=?sf.next())?!=?null)?{
System.out.println(t.termText());
}
}?catch?(Exception?e)?{
e.printStackTrace();
public?static?void?testCJK(String?testString)?{
Analyzer?analyzer?=?new?CJKAnalyzer();
System.err.println("=====cjk?analyzer====");
public?static?void?testChiniese(String?testString)?{
Analyzer?analyzer?=?new?ChineseAnalyzer();
TokenFilter?tf?=?(TokenFilter)?analyzer.tokenStream("",?r);
System.err.println("=====chinese?analyzer====");
while?((t?=?tf.next())?!=?null)?{
String?result?=?"";
Analyzer?analyzer?=?new?MIK_CAnalyzer();
TokenStream?ts?=?(TokenStream)?analyzer.tokenStream("",?r);
while?((t?=?ts.next())?!=?null)?{
result?+=?t.termText()?+?",";
return?result;
public?static?void?main(String[]?args)?{
String?testString?=?"中文分词的方法其实不局限于中文应用,也被应用到英文处理,如手写识别,单词之间的空格就很清楚,中文分词方法可以帮助判别英文单词的边界";
System.out.println("测试的语句?"+testString);
for?(int?i?=?0;?i?sResult.length;?i++)?{
System.out.println(sResult[i]);
jar包
需要commons-io包, 或者自己写读文件的部分
import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.commons.io.FileUtils;
/**
*/
//?TODO?Auto-generated?method?stub
String?str?=?null;
str?=?FileUtils.readFileToString(new?File("e.txt"));
}?catch?(IOException?e)?{
//?TODO?Auto-generated?catch?block
Pattern?p?=?Pattern.compile("\\b[\\w-']+\\b");
Matcher?m?=?p.matcher(str);
ListWord?words?=?new?ArrayListWord();
while(m.find()){
add(words,?m.group().trim());
Collections.sort(words,?new?ComparatorWord(){
}});
System.out.println(words);
private?static?void?add(ListWord?words,?String?word)?{
for(Word?temp?:?words){
if(temp.getWord().equals(word)){
temp.setCount(temp.getCount()?+?1);
return;
Word?w?=?new?Word();
w.setWord(word);
words.add(w);
/** A word paired with its occurrence count; a fresh instance counts 1. */
class Word {
    private String word;
    private int count = 1;

    public String getWord() {
        return word;
    }

    public void setWord(String word) {
        this.word = word;
    }

    public int getCount() {
        return count;
    }

    public void setCount(int count) {
        this.count = count;
    }

    @Override
    public String toString() {
        return "Word [word=" + word + ", count=" + count + "]";
    }
}
现可以提供两种思路:
①String或是StringBuffer(建议用)中的indexOf("中华")方法,查找给定的字符串中是否有给定词表中的词.
②先编写一个状态机,用于测试给定字符串中的词是否满足词表中的内容.
写在最后:1)建议使用第一种方法,因为java内部实现的查找操作其实和你想的思路是相同的,不过它的效率会高些.
如果你的分词规则是在一个字符串的开头和结尾加上"_",然后两个字符一分的话,代码可以这样写:
import java.util.ArrayList;
import java.util.List;
/**
 * Bigram-style segmentation demo: pads a word with "_" on both ends and
 * slides a two-character window across it, e.g. "计算机" → _计, 计算, 算机, 机_.
 */
public class Participle {
    private static final String HEAD_END_STR = "_";
    // Window width. NOTE(review): the constant was lost in the paste; 2 is
    // inferred from the documented output (two characters per piece).
    private static final int PARTICIPLE_LENGTH = 2;

    /**
     * Splits {@code word} into overlapping PARTICIPLE_LENGTH-character pieces
     * after padding it with HEAD_END_STR on both sides.
     *
     * @param word the word to segment (may be empty, giving ["__"])
     * @return the list of pieces in order
     */
    public static List<String> participle(String word) {
        String padded = HEAD_END_STR + word + HEAD_END_STR;
        List<String> result = new ArrayList<String>();
        // Stop at length - 1 so every substring has PARTICIPLE_LENGTH chars.
        for (int i = 0; i < padded.length() - 1; i++) {
            result.add(padded.substring(i, i + PARTICIPLE_LENGTH));
        }
        return result;
    }

    public static void main(String[] args) {
        // Same output as the original inline version: [_计, 计算, 算机, 机_]
        System.out.println(participle("计算机"));
    }
}
输出结果:[_计, 计算, 算机, 机_]
以上就是土嘎嘎小编为大家整理的java文本分词代码相关主题介绍,如果您觉得小编更新的文章只要能对粉丝们有用,就是我们最大的鼓励和动力,不要忘记讲本站分享给您身边的朋友哦!!