当前位置: 首页>数据库>正文

IK分词器

下载
elasticsearch:https://www.elastic.co/cn/elasticsearch
elasticsearch-analysis-ik:https://github.com/medcl/elasticsearch-analysis-ik


将 elasticsearch-analysis-ik 解压至 elasticsearch 的 plugins 目录下
启动 elasticsearch:运行 elasticsearch\bin\elasticsearch.bat,启动后访问 http://localhost:9200 验证
启动 kibana:运行 kibana\bin\kibana.bat,启动后访问 http://localhost:5601 验证

开发工具 - 控制台
最少切分

GET _analyze
{
  "analyzer": "ik_smart",
  "text": ["好好学习"]
}
// 分词结果
{
  "tokens" : [
    {
      "token" : "好好学习",
      "start_offset" : 0,
      "end_offset" : 4,
      "type" : "CN_WORD",
      "position" : 0
    }
  ]
}

最细粒度划分

GET _analyze
{
  "analyzer": "ik_max_word",
  "text": ["好好学习"]
}
// 分词结果
{
  "tokens" : [
    {
      "token" : "好好学习",
      "start_offset" : 0,
      "end_offset" : 4,
      "type" : "CN_WORD",
      "position" : 0
    },
    {
      "token" : "好好学",
      "start_offset" : 0,
      "end_offset" : 3,
      "type" : "CN_WORD",
      "position" : 1
    },
    {
      "token" : "好好",
      "start_offset" : 0,
      "end_offset" : 2,
      "type" : "CN_WORD",
      "position" : 2
    },
    {
      "token" : "好学",
      "start_offset" : 1,
      "end_offset" : 3,
      "type" : "CN_WORD",
      "position" : 3
    },
    {
      "token" : "学习",
      "start_offset" : 2,
      "end_offset" : 4,
      "type" : "CN_WORD",
      "position" : 4
    }
  ]
}

自定义配置分词字典
elasticsearch-7.6.2\plugins\elasticsearch-analysis-ik-7.6.2\config\IKAnalyzer.cfg.xml

<!-- 新建 .dic 文件(UTF-8 编码,每行一个词)放入同一 config 目录,在 IKAnalyzer.cfg.xml 中通过 ext_dict 配置;修改后需重启 elasticsearch 才能生效 -->
<entry key="ext_dict">春眠不觉晓.dic</entry>
GET _analyze
{
  "analyzer": "ik_max_word",
  "text": ["春眠不觉晓"]
}
// 自定义分词前
{
  "tokens" : [
    {
      "token" : "春",
      "start_offset" : 0,
      "end_offset" : 1,
      "type" : "CN_CHAR",
      "position" : 0
    },
    {
      "token" : "眠",
      "start_offset" : 1,
      "end_offset" : 2,
      "type" : "CN_CHAR",
      "position" : 1
    },
    {
      "token" : "不觉",
      "start_offset" : 2,
      "end_offset" : 4,
      "type" : "CN_WORD",
      "position" : 2
    },
    {
      "token" : "晓",
      "start_offset" : 4,
      "end_offset" : 5,
      "type" : "CN_CHAR",
      "position" : 3
    }
  ]
}
// 自定义分词后
{
  "tokens" : [
    {
      "token" : "春眠不觉晓",
      "start_offset" : 0,
      "end_offset" : 5,
      "type" : "CN_WORD",
      "position" : 0
    },
    {
      "token" : "不觉",
      "start_offset" : 2,
      "end_offset" : 4,
      "type" : "CN_WORD",
      "position" : 1
    },
    {
      "token" : "晓",
      "start_offset" : 4,
      "end_offset" : 5,
      "type" : "CN_CHAR",
      "position" : 2
    }
  ]
}

https://www.xamrdz.com/database/6ym1848901.html

相关文章: