Commit 29d6224f authored by weixin_43283383

fix issue #31

Parent 48a9579d
/data
/work
/logs
/.idea
/target
/out
.DS_Store
*.iml
\.*
IK Analysis for ElasticSearch
==================================
The IK Analysis plugin integrates the Lucene IK analyzer into Elasticsearch and supports custom dictionaries.
Version
-------------
IK version | ES version
-----------|-----------
master | 0.0.0 -> master
1.2.6  | 1.0.0
1.2.5  | 0.90.2
1.2.3  | 0.90.2
1.2.0  | 0.90.0
1.1.3  | 0.20.2
1.1.2  | 0.19.x
1.0.0  | 0.16.2 -> 0.19.0
Install
-------------
You can download this plugin from the RTF project (https://github.com/medcl/elasticsearch-rtf):
https://github.com/medcl/elasticsearch-rtf/tree/master/elasticsearch/plugins/analysis-ik
https://github.com/medcl/elasticsearch-rtf/tree/master/elasticsearch/config/ik
<del>Also remember to download the dict files and unzip them into your elasticsearch config folder, e.g. your-es-root/config/ik.</del>
You need to restart the service after that!
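Alternatively, you can build the plugin from source with Maven. A minimal sketch, based on the assembly descriptor in this repository (the exact zip name may differ in your build):
<pre>
mvn clean package
# the maven-assembly-plugin writes the release zip to target/releases/
mkdir -p $ES_HOME/plugins/analysis-ik
unzip target/releases/elasticsearch-analysis-ik-1.2.6.zip -d $ES_HOME/plugins/analysis-ik
# ship the dictionary config along with it, so that es-root/config/ik exists
cp -r config/ik $ES_HOME/config/
</pre>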
Dict Configuration (es-root/config/ik/IKAnalyzer.cfg.xml)
-------------
https://github.com/medcl/elasticsearch-analysis-ik/blob/master/config/ik/IKAnalyzer.cfg.xml
<pre>
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE properties SYSTEM "http://java.sun.com/dtd/properties.dtd">
<properties>
    <comment>IK Analyzer extension configuration</comment>
    <!-- users can configure their extension dictionaries here -->
    <entry key="ext_dict">custom/mydict.dic;custom/single_word_low_freq.dic</entry>
    <!-- users can configure their extension stopword dictionaries here -->
    <entry key="ext_stopwords">custom/ext_stopword.dic</entry>
</properties>
</pre>
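Each file listed in `ext_dict` and `ext_stopwords` is resolved relative to the `ik` config folder and is a plain-text word list, one term per line, UTF-8 encoded. A hypothetical custom/mydict.dic:
<pre>
中文分词
分词器
云计算
</pre>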
Analysis Configuration (elasticsearch.yml)
-------------
<pre>
index:
  analysis:
    analyzer:
      ik:
          alias: [ik_analyzer]
          type: org.elasticsearch.index.analysis.IkAnalyzerProvider
      ik_max_word:
          type: ik
          use_smart: false
      ik_smart:
          type: ik
          use_smart: true
</pre>
Or
<pre>
index.analysis.analyzer.ik.type : "ik"
</pre>
You can set your preferred segmentation mode; `use_smart` defaults to false (fine-grained segmentation).
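To see the difference between the two modes, you can hit the analyze API; a sketch, assuming the analyzers above are configured and a node is listening on localhost:9200:
<pre>
curl 'http://localhost:9200/index/_analyze?analyzer=ik_max_word&text=中华人民共和国'
curl 'http://localhost:9200/index/_analyze?analyzer=ik_smart&text=中华人民共和国'
</pre>
`ik_max_word` emits the fine-grained, overlapping tokens, while `ik_smart` emits the coarser smart segmentation.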
Mapping Configuration
-------------
Here is a quick example:
1. Create an index:
<pre>
curl -XPUT http://localhost:9200/index
</pre>
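As an alternative to configuring analyzers in elasticsearch.yml, the same analysis settings can be supplied at index-creation time; a sketch (assumption: your ES version accepts the flat settings syntax):
<pre>
curl -XPUT http://localhost:9200/index -d'
{
    "settings": {
        "index.analysis.analyzer.default.type": "ik"
    }
}'
</pre>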
2. Create a mapping:
<pre>
curl -XPOST http://localhost:9200/index/fulltext/_mapping -d'
{
    "fulltext": {
        "_all": {
            "indexAnalyzer": "ik",
            "searchAnalyzer": "ik",
            "term_vector": "no",
            "store": "false"
        },
        "properties": {
            "content": {
                "type": "string",
                "store": "no",
                "term_vector": "with_positions_offsets",
                "indexAnalyzer": "ik",
                "searchAnalyzer": "ik",
                "include_in_all": "true",
                "boost": 8
            }
        }
    }
}'
</pre>
3. Index some documents:
<pre>
curl -XPOST http://localhost:9200/index/fulltext/1 -d'
{"content":"美国留给伊拉克的是个烂摊子吗"}
'
curl -XPOST http://localhost:9200/index/fulltext/2 -d'
{"content":"公安部:各地校车将享最高路权"}
'
curl -XPOST http://localhost:9200/index/fulltext/3 -d'
{"content":"中韩渔警冲突调查:韩警平均每天扣1艘中国渔船"}
'
curl -XPOST http://localhost:9200/index/fulltext/4 -d'
{"content":"中国驻洛杉矶领事馆遭亚裔男子枪击 嫌犯已自首"}
'
</pre>
4. Query with highlighting:
<pre>
curl -XPOST http://localhost:9200/index/fulltext/_search -d'
{
    "query" : { "term" : { "content" : "中国" }},
    "highlight" : {
        "pre_tags" : ["<tag1>", "<tag2>"],
        "post_tags" : ["</tag1>", "</tag2>"],
        "fields" : {
            "content" : {}
        }
    }
}
'
</pre>
Here is the query result:
<pre>
{
    "took": 14,
    "timed_out": false,
    "_shards": {
        "total": 5,
        "successful": 5,
        "failed": 0
    },
    "hits": {
        "total": 2,
        "max_score": 2,
        "hits": [
            {
                "_index": "index",
                "_type": "fulltext",
                "_id": "4",
                "_score": 2,
                "_source": {
                    "content": "中国驻洛杉矶领事馆遭亚裔男子枪击 嫌犯已自首"
                },
                "highlight": {
                    "content": [
                        "<tag1>中国</tag1>驻洛杉矶领事馆遭亚裔男子枪击 嫌犯已自首 "
                    ]
                }
            },
            {
                "_index": "index",
                "_type": "fulltext",
                "_id": "3",
                "_score": 2,
                "_source": {
                    "content": "中韩渔警冲突调查:韩警平均每天扣1艘中国渔船"
                },
                "highlight": {
                    "content": [
                        "均每天扣1艘<tag1>中国</tag1>渔船 "
                    ]
                }
            }
        ]
    }
}
</pre>
Have fun.
FAQ
-------------
1. Why isn't my custom dictionary taking effect?
Make sure your extension dictionary files are saved as UTF-8 encoded text.
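A quick way to check and, if needed, convert the encoding with standard Unix tools (file paths here are illustrative):
<pre>
file custom/mydict.dic     # should report: UTF-8 Unicode text
iconv -f GBK -t UTF-8 custom/mydict.dic > custom/mydict.utf8.dic
</pre>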
medcl
This diff is collapsed.
This diff is collapsed.
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">

    <name>elasticsearch-analysis-ik</name>
    <modelVersion>4.0.0</modelVersion>
    <groupId>org.elasticsearch</groupId>
    <artifactId>elasticsearch-analysis-ik</artifactId>
    <version>1.2.6</version>
    <packaging>jar</packaging>
    <description>IK Analyzer for ElasticSearch</description>
    <inceptionYear>2009</inceptionYear>

    <licenses>
        <license>
            <name>The Apache Software License, Version 2.0</name>
            <url>http://www.apache.org/licenses/LICENSE-2.0.txt</url>
            <distribution>repo</distribution>
        </license>
    </licenses>

    <scm>
        <connection>scm:git:git@github.com:medcl/elasticsearch-analysis-ik.git</connection>
        <developerConnection>scm:git:git@github.com:medcl/elasticsearch-analysis-ik.git</developerConnection>
        <url>http://github.com/medcl/elasticsearch-analysis-ik</url>
    </scm>

    <parent>
        <groupId>org.sonatype.oss</groupId>
        <artifactId>oss-parent</artifactId>
        <version>7</version>
    </parent>

    <properties>
        <elasticsearch.version>1.0.0</elasticsearch.version>
    </properties>

    <repositories>
        <repository>
            <id>oss.sonatype.org</id>
            <name>OSS Sonatype</name>
            <releases><enabled>true</enabled></releases>
            <snapshots><enabled>true</enabled></snapshots>
            <url>http://oss.sonatype.org/content/repositories/releases/</url>
        </repository>
    </repositories>

    <dependencies>
        <dependency>
            <groupId>org.elasticsearch</groupId>
            <artifactId>elasticsearch</artifactId>
            <version>${elasticsearch.version}</version>
            <scope>compile</scope>
        </dependency>
        <dependency>
            <groupId>log4j</groupId>
            <artifactId>log4j</artifactId>
            <version>1.2.16</version>
            <scope>runtime</scope>
        </dependency>
        <dependency>
            <groupId>org.hamcrest</groupId>
            <artifactId>hamcrest-core</artifactId>
            <version>1.3.RC2</version>
            <scope>test</scope>
        </dependency>
        <dependency>
            <groupId>org.hamcrest</groupId>
            <artifactId>hamcrest-library</artifactId>
            <version>1.3.RC2</version>
            <scope>test</scope>
        </dependency>
        <dependency>
            <groupId>junit</groupId>
            <artifactId>junit</artifactId>
            <version>4.10</version>
            <scope>test</scope>
        </dependency>
    </dependencies>

    <build>
        <plugins>
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-compiler-plugin</artifactId>
                <version>2.3.2</version>
                <configuration>
                    <source>1.6</source>
                    <target>1.6</target>
                </configuration>
            </plugin>
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-surefire-plugin</artifactId>
                <version>2.11</version>
                <configuration>
                    <includes>
                        <include>**/*Tests.java</include>
                    </includes>
                </configuration>
            </plugin>
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-source-plugin</artifactId>
                <version>2.1.2</version>
                <executions>
                    <execution>
                        <id>attach-sources</id>
                        <goals>
                            <goal>jar</goal>
                        </goals>
                    </execution>
                </executions>
            </plugin>
            <plugin>
                <artifactId>maven-assembly-plugin</artifactId>
                <configuration>
                    <outputDirectory>${project.build.directory}/releases/</outputDirectory>
                    <descriptors>
                        <descriptor>${basedir}/src/main/assemblies/plugin.xml</descriptor>
                    </descriptors>
                </configuration>
                <executions>
                    <execution>
                        <phase>package</phase>
                        <goals>
                            <goal>single</goal>
                        </goals>
                    </execution>
                </executions>
            </plugin>
        </plugins>
    </build>
</project>
package org.elasticsearch.index.analysis;

/**
 * Registers the "ik" analyzer and tokenizer with the Elasticsearch analysis module.
 */
public class IkAnalysisBinderProcessor extends AnalysisModule.AnalysisBinderProcessor {

    @Override
    public void processTokenFilters(TokenFiltersBindings tokenFiltersBindings) {
    }

    @Override
    public void processAnalyzers(AnalyzersBindings analyzersBindings) {
        analyzersBindings.processAnalyzer("ik", IkAnalyzerProvider.class);
        super.processAnalyzers(analyzersBindings);
    }

    @Override
    public void processTokenizers(TokenizersBindings tokenizersBindings) {
        tokenizersBindings.processTokenizer("ik", IkTokenizerFactory.class);
        super.processTokenizers(tokenizersBindings);
    }
}
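Because the processor above registers both an analyzer and a tokenizer under the name `ik`, either can be exercised through the analyze API once the plugin is installed; a sketch, assuming a local node:
<pre>
curl 'http://localhost:9200/index/_analyze?tokenizer=ik&text=中华人民共和国'
</pre>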
package org.wltea.analyzer.cfg;

import java.io.*;
import java.util.ArrayList;
import java.util.InvalidPropertiesFormatException;
import java.util.List;
import java.util.Properties;

import org.elasticsearch.common.logging.ESLogger;
import org.elasticsearch.common.logging.Loggers;
import org.elasticsearch.env.Environment;

/**
 * Loads ik/IKAnalyzer.cfg.xml from the Elasticsearch config directory and
 * exposes the configured extension dictionary and stopword file paths.
 */
public class Configuration {

    private static final String FILE_NAME = "ik/IKAnalyzer.cfg.xml";
    private static final String EXT_DICT = "ext_dict";
    private static final String EXT_STOP = "ext_stopwords";
    private static ESLogger logger = null;
    private Properties props;
    private Environment environment;

    public Configuration(Environment env) {
        logger = Loggers.getLogger("ik-analyzer");
        props = new Properties();
        environment = env;

        File fileConfig = new File(environment.configFile(), FILE_NAME);

        InputStream input = null;
        try {
            input = new FileInputStream(fileConfig);
        } catch (FileNotFoundException e) {
            logger.error("ik-analyzer", e);
        }
        if (input != null) {
            try {
                // the config file is a java.util.Properties XML file
                props.loadFromXML(input);
            } catch (InvalidPropertiesFormatException e) {
                logger.error("ik-analyzer", e);
            } catch (IOException e) {
                logger.error("ik-analyzer", e);
            }
        }
    }

    /**
     * Returns the extension dictionary paths configured under "ext_dict";
     * entries are ';'-separated and resolved relative to the "ik" config folder.
     */
    public List<String> getExtDictionarys() {
        List<String> extDictFiles = new ArrayList<String>(2);
        String extDictCfg = props.getProperty(EXT_DICT);
        if (extDictCfg != null) {
            String[] filePaths = extDictCfg.split(";");
            if (filePaths != null) {
                for (String filePath : filePaths) {
                    if (filePath != null && !"".equals(filePath.trim())) {
                        File file = new File("ik", filePath.trim());
                        extDictFiles.add(file.toString());
                    }
                }
            }
        }
        return extDictFiles;
    }

    /**
     * Returns the extension stopword dictionary paths configured under "ext_stopwords".
     */
    public List<String> getExtStopWordDictionarys() {
        List<String> extStopWordDictFiles = new ArrayList<String>(2);
        String extStopWordDictCfg = props.getProperty(EXT_STOP);
        if (extStopWordDictCfg != null) {
            String[] filePaths = extStopWordDictCfg.split(";");
            if (filePaths != null) {
                for (String filePath : filePaths) {
                    if (filePath != null && !"".equals(filePath.trim())) {
                        File file = new File("ik", filePath.trim());
                        extStopWordDictFiles.add(file.toString());
                    }
                }
            }
        }
        return extStopWordDictFiles;
    }

    public File getDictRoot() {
        return environment.configFile();
    }
}
The hunk below, in org.wltea.analyzer.core.IKSegmenter, is the actual fix for issue #31: the hard-coded default now matches the documented behaviour that `use_smart` defaults to false.

@@ -59,7 +59,7 @@ public final class IKSegmenter {
 	public IKSegmenter(Reader input , Settings settings, Environment environment){
 		this.input = input;
 		this.cfg = new Configuration(environment);
-		this.useSmart = settings.get("use_smart", "true").equals("true");
+		this.useSmart = settings.get("use_smart", "false").equals("true");
 		this.init();
 	}
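After restarting the node, the change can be verified by analyzing without an explicit `use_smart` setting; a sketch (analyzer name as registered above):
<pre>
curl 'http://localhost:9200/index/_analyze?analyzer=ik&text=中华人民共和国'
# expect the fine-grained (max_word) token stream, since use_smart now defaults to false
</pre>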
/**
* IK 中文分词 版本 5.0.1
* IK Analyzer release 5.0.1
*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
* 源代码由林良益(linliangyi2005@gmail.com)提供
* 版权声明 2012,乌龙茶工作室
* provided by Linliangyi and copyright 2012 by Oolong studio
*
*/
package org.wltea.analyzer.lucene;

import java.io.Reader;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.Tokenizer;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.env.Environment;

/**
 * IK analyzer: implementation of the Lucene Analyzer interface,
 * compatible with Lucene 4.0.
 */
public final class IKAnalyzer extends Analyzer {

    private boolean useSmart;

    public boolean useSmart() {
        return useSmart;
    }

    public void setUseSmart(boolean useSmart) {
        this.useSmart = useSmart;
    }

    /**
     * Default constructor: uses the fine-grained segmentation algorithm.
     */
    public IKAnalyzer() {
        this(false);
    }

    /**
     * @param useSmart when true, the analyzer performs smart segmentation
     */
    public IKAnalyzer(boolean useSmart) {
        super();
        this.useSmart = useSmart;
    }

    Settings settings;
    Environment environment;

    public IKAnalyzer(Settings indexSetting, Settings settings, Environment environment) {
        super();
        this.settings = settings;
        this.environment = environment;
    }

    /**
     * Overrides the Analyzer interface to build the token stream components.
     */
    @Override
    protected TokenStreamComponents createComponents(String fieldName, final Reader in) {
        Tokenizer _IKTokenizer = new IKTokenizer(in, settings, environment);
        return new TokenStreamComponents(_IKTokenizer);
    }
}
/**
* IK 中文分词 版本 5.0.1
* IK Analyzer release 5.0.1
*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
* 源代码由林良益(linliangyi2005@gmail.com)提供
* 版权声明 2012,乌龙茶工作室
* provided by Linliangyi and copyright 2012 by Oolong studio
*
*
*/
package org.wltea.analyzer.lucene;

import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.env.Environment;
import org.wltea.analyzer.core.IKSegmenter;
import org.wltea.analyzer.core.Lexeme;

import java.io.IOException;
import java.io.Reader;

/**
 * IK tokenizer: Lucene Tokenizer adapter, compatible with Lucene 4.0.
 */
public final class IKTokenizer extends Tokenizer {

    // the underlying IK segmenter
    private IKSegmenter _IKImplement;

    // term text attribute
    private final CharTermAttribute termAtt;
    // term offset attribute
    private final OffsetAttribute offsetAtt;
    // term type attribute (see the type constants in org.wltea.analyzer.core.Lexeme)
    private final TypeAttribute typeAtt;
    // end position of the last lexeme emitted
    private int endPosition;

    /**
     * Constructor for the Lucene 4.0 Tokenizer adapter.
     * @param in the input reader
     */
    public IKTokenizer(Reader in, Settings settings, Environment environment) {
        super(in);
        offsetAtt = addAttribute(OffsetAttribute.class);
        termAtt = addAttribute(CharTermAttribute.class);
        typeAtt = addAttribute(TypeAttribute.class);
        _IKImplement = new IKSegmenter(input, settings, environment);
    }

    /* (non-Javadoc)
     * @see org.apache.lucene.analysis.TokenStream#incrementToken()
     */
    @Override
    public boolean incrementToken() throws IOException {
        // clear all token attributes
        clearAttributes();
        Lexeme nextLexeme = _IKImplement.next();
        if (nextLexeme != null) {
            // copy the Lexeme into the Lucene attributes:
            // set the term text (lowercased)
            termAtt.append(nextLexeme.getLexemeText().toLowerCase());
            // set the term length
            termAtt.setLength(nextLexeme.getLength());
            // set the term offsets
            offsetAtt.setOffset(nextLexeme.getBeginPosition(), nextLexeme.getEndPosition());
            // record the last segmentation position
            endPosition = nextLexeme.getEndPosition();
            // record the lexeme type
            typeAtt.setType(nextLexeme.getLexemeTypeString());
            // return true: another token is available
            return true;
        }
        // return false: no more tokens
        return false;
    }

    /*
     * (non-Javadoc)
     * @see org.apache.lucene.analysis.Tokenizer#reset(java.io.Reader)
     */
    @Override
    public void reset() throws IOException {
        super.reset();
        _IKImplement.reset(input);
    }

    @Override
    public final void end() {
        // set the final offset
        int finalOffset = correctOffset(this.endPosition);
        offsetAtt.setOffset(finalOffset, finalOffset);
    }
}