kuromoji

created at 2020-08-03 21:51+0900

atilika/kuromoji をビルドする

atilika/kuromojiはJavaで書かれた日本語形態素解析ツール。本家サイトからソースをゲットだぜ。

git clone https://github.com/atilika/kuromoji

使いたい辞書のプロジェクトとCoreのプロジェクトをそれぞれビルドする。

# Run from the cloned repository root. -pl selects the dictionary module and
# -am also builds the modules it depends on (the parent and kuromoji-core).
cd kuromoji
mvn clean package -pl kuromoji-unidic-neologd -am

atilika/kuromoji で形態素解析する

Spring Boot で Gradle プロジェクトを作る

上記でビルドしたjarをプロジェクト直下の libs ディレクトリに置いて、build.gradle に依存関係を設定する。

// Gradle build script for the Spring Boot project that calls kuromoji.
plugins {
    id 'org.springframework.boot' version '2.3.2.RELEASE'
    id 'io.spring.dependency-management' version '1.0.9.RELEASE'
    id 'java'
}

// Project coordinates and Java source level.
group = 'net.r_square'
version = '0.0.1-SNAPSHOT'
sourceCompatibility = '11'

repositories {
    mavenCentral()
}

dependencies {
    implementation 'org.springframework.boot:spring-boot-starter-data-redis'
    implementation 'org.springframework.boot:spring-boot-starter-web'
    // The test starter is used without the JUnit Vintage engine.
    testImplementation('org.springframework.boot:spring-boot-starter-test') {
        exclude group: 'org.junit.vintage', module: 'junit-vintage-engine'
    }
    // Adds every jar matching kuromoji-*.jar found in the local 'libs' directory.
    implementation fileTree(dir: 'libs', include: 'kuromoji-*.jar') // <---- this is the line to add
}

test {
    // Run tests on the JUnit Platform.
    useJUnitPlatform()
}

形態素解析サービスを作る

Tokenizerクラスのtokenizeメソッドを呼び出す。

// import文は一部省略
import com.atilika.kuromoji.unidic.neologd.Token;
import com.atilika.kuromoji.unidic.neologd.Tokenizer;

@Service
public class NlpServiceImpl implements NlpService {
    private static final Logger logger = LoggerFactory.getLogger(NlpServiceImpl.class);
    @Override
    public List<Token> tokenize(String sentence) {
        Tokenizer tokenizer = new Tokenizer();
        List<Token> tokens = tokenizer.tokenize(sentence);

        int tokenSize = tokens.size();
        for (int i = 0; i < tokenSize; i++) {
            Token token = tokens.get(i);
            logger.debug(String.format("%s \t %s", token.getSurface(), token.getAllFeatures()));
        }
        return tokens;
    }

Spring Boot で REST API を作る

形態素解析メソッドを呼び出すAPIを作る。

// import 文は省略
/**
 * REST controller that exposes the tokenization service under
 * {@code /textgen/api}.
 */
@RestController
@RequestMapping("/textgen/api")
public class TextGenController {

    /** Service that performs the tokenization; injected by Spring. */
    @Autowired
    protected NlpService nlpService;

    /**
     * Handles {@code GET /textgen/api/token} by tokenizing the {@code s}
     * request parameter.
     *
     * @param sentence the text to analyse, taken from request parameter {@code s}
     * @return a 200 OK response whose body is the list of tokens
     */
    @GetMapping("/token")
    public ResponseEntity<List<Token>> token(@RequestParam(name = "s") String sentence) {
        // @RestController already implies @ResponseBody, so it is not repeated here.
        return ResponseEntity.ok(nlpService.tokenize(sentence));
    }
}

HTTP クライアントで呼び出す

解析したい文章。句点(。)を含む固有名詞がある。意地悪な文章。

今夜のミュージックフェアは、E-girlsとモーニング娘。とおニャン子クラブの夢の共演です。始まります。

$ curl --get http://localhost:8080/textgen/api/token --data-urlencode 's=今夜のミュージックフェアは、E-girlsとモーニング娘。とおニャン子クラブの夢の共演です。始まります。' | jq
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 15412    0 15412    0     0   5515      0 --:--:--  0:00:02 --:--:--  5514
[
  {
    "surface": "今夜",
    "position": 0,
    "conjugationType": "*",
    "lemmaReadingForm": "コンヤ",
    "pronunciation": "コンヤ",
    "pronunciationBaseForm": "コンヤ",
    "writtenForm": "今夜",
    "writtenBaseForm": "今夜",
    "languageType": "漢",
    "initialSoundAlterationType": "*",
    "initialSoundAlterationForm": "*",
    "finalSoundAlterationType": "*",
    "finalSoundAlterationForm": "*",
    "lemma": "今夜",
    "partOfSpeechLevel1": "名詞",
    "partOfSpeechLevel2": "普通名詞",
    "partOfSpeechLevel3": "副詞可能",
    "partOfSpeechLevel4": "*",
    "conjugationForm": "*",
    "known": true,
    "allFeatures": "名詞,普通名詞,副詞可能,*,*,*,コンヤ,今夜,今夜,コンヤ,今夜,コンヤ,漢,*,*,*,*",
    "user": false,
    "allFeaturesArray": [
      "名詞",
      "普通名詞",
      "副詞可能",
      "*",
      "*",
      "*",
      "コンヤ",
      "今夜",
      "今夜",
      "コンヤ",
      "今夜",
      "コンヤ",
      "漢",
      "*",
      "*",
      "*",
      "*"
    ]
  },
  {
    "surface": "の",
    "position": 2,
    "conjugationType": "*",
    "lemmaReadingForm": "ノ",
    "pronunciation": "ノ",
    "pronunciationBaseForm": "ノ",
    "writtenForm": "の",
    "writtenBaseForm": "の",
    "languageType": "和",
    "initialSoundAlterationType": "*",
    "initialSoundAlterationForm": "*",
    "finalSoundAlterationType": "*",
    "finalSoundAlterationForm": "*",
    "lemma": "の",
    "partOfSpeechLevel1": "助詞",
    "partOfSpeechLevel2": "格助詞",
    "partOfSpeechLevel3": "*",
    "partOfSpeechLevel4": "*",
    "conjugationForm": "*",
    "known": true,
    "allFeatures": "助詞,格助詞,*,*,*,*,ノ,の,の,ノ,の,ノ,和,*,*,*,*",
    "user": false,
    "allFeaturesArray": [
      "助詞",
      "格助詞",
      "*",
      "*",
      "*",
      "*",
      "ノ",
      "の",
      "の",
      "ノ",
      "の",
      "ノ",
      "和",
      "*",
      "*",
      "*",
      "*"
    ]
  },
  {
    "surface": "ミュージックフェア",
    "position": 3,
    "conjugationType": "*",
    "lemmaReadingForm": "ミュージックフェア",
    "pronunciation": "ミュージックフェア",
    "pronunciationBaseForm": "ミュージックフェア",
    "writtenForm": "ミュージックフェア",
    "writtenBaseForm": "ミュージックフェア",
    "languageType": "固",
    "initialSoundAlterationType": "*",
    "initialSoundAlterationForm": "*",
    "finalSoundAlterationType": "*",
    "finalSoundAlterationForm": "*",
    "lemma": "MUSIC FAIR",
    "partOfSpeechLevel1": "名詞",
    "partOfSpeechLevel2": "固有名詞",
    "partOfSpeechLevel3": "一般",
    "partOfSpeechLevel4": "*",
    "conjugationForm": "*",
    "known": true,
    "allFeatures": "名詞,固有名詞,一般,*,*,*,ミュージックフェア,MUSIC FAIR,ミュージックフェア,ミュージックフェア,ミュージックフェア,ミュージックフェア,固,*,*,*,*",
    "user": false,
    "allFeaturesArray": [
      "名詞",
      "固有名詞",
      "一般",
      "*",
      "*",
      "*",
      "ミュージックフェア",
      "MUSIC FAIR",
      "ミュージックフェア",
      "ミュージックフェア",
      "ミュージックフェア",
      "ミュージックフェア",
      "固",
      "*",
      "*",
      "*",
      "*"
    ]
  },
  {
    "surface": "は",
    "position": 12,
    "conjugationType": "*",
    "lemmaReadingForm": "ハ",
    "pronunciation": "ワ",
    "pronunciationBaseForm": "ワ",
    "writtenForm": "は",
    "writtenBaseForm": "は",
    "languageType": "和",
    "initialSoundAlterationType": "*",
    "initialSoundAlterationForm": "*",
    "finalSoundAlterationType": "*",
    "finalSoundAlterationForm": "*",
    "lemma": "は",
    "partOfSpeechLevel1": "助詞",
    "partOfSpeechLevel2": "係助詞",
    "partOfSpeechLevel3": "*",
    "partOfSpeechLevel4": "*",
    "conjugationForm": "*",
    "known": true,
    "allFeatures": "助詞,係助詞,*,*,*,*,ハ,は,は,ワ,は,ワ,和,*,*,*,*",
    "user": false,
    "allFeaturesArray": [
      "助詞",
      "係助詞",
      "*",
      "*",
      "*",
      "*",
      "ハ",
      "は",
      "は",
      "ワ",
      "は",
      "ワ",
      "和",
      "*",
      "*",
      "*",
      "*"
    ]
  },
  {
    "surface": "、",
    "position": 13,
    "conjugationType": "*",
    "lemmaReadingForm": "",
    "pronunciation": "",
    "pronunciationBaseForm": "",
    "writtenForm": "、",
    "writtenBaseForm": "、",
    "languageType": "記号",
    "initialSoundAlterationType": "*",
    "initialSoundAlterationForm": "*",
    "finalSoundAlterationType": "*",
    "finalSoundAlterationForm": "*",
    "lemma": "、",
    "partOfSpeechLevel1": "補助記号",
    "partOfSpeechLevel2": "読点",
    "partOfSpeechLevel3": "*",
    "partOfSpeechLevel4": "*",
    "conjugationForm": "*",
    "known": true,
    "allFeatures": "補助記号,読点,*,*,*,*,,、,、,,、,,記号,*,*,*,*",
    "user": false,
    "allFeaturesArray": [
      "補助記号",
      "読点",
      "*",
      "*",
      "*",
      "*",
      "",
      "、",
      "、",
      "",
      "、",
      "",
      "記号",
      "*",
      "*",
      "*",
      "*"
    ]
  },
  {
    "surface": "E-girls",
    "position": 14,
    "conjugationType": "*",
    "lemmaReadingForm": "イーガールズ",
    "pronunciation": "イーガールズ",
    "pronunciationBaseForm": "イーガールズ",
    "writtenForm": "E-girls",
    "writtenBaseForm": "E-girls",
    "languageType": "固",
    "initialSoundAlterationType": "*",
    "initialSoundAlterationForm": "*",
    "finalSoundAlterationType": "*",
    "finalSoundAlterationForm": "*",
    "lemma": "E-girls",
    "partOfSpeechLevel1": "名詞",
    "partOfSpeechLevel2": "固有名詞",
    "partOfSpeechLevel3": "一般",
    "partOfSpeechLevel4": "*",
    "conjugationForm": "*",
    "known": true,
    "allFeatures": "名詞,固有名詞,一般,*,*,*,イーガールズ,E-girls,E-girls,イーガールズ,E-girls,イーガールズ,固,*,*,*,*",
    "user": false,
    "allFeaturesArray": [
      "名詞",
      "固有名詞",
      "一般",
      "*",
      "*",
      "*",
      "イーガールズ",
      "E-girls",
      "E-girls",
      "イーガールズ",
      "E-girls",
      "イーガールズ",
      "固",
      "*",
      "*",
      "*",
      "*"
    ]
  },
  {
    "surface": "と",
    "position": 21,
    "conjugationType": "*",
    "lemmaReadingForm": "ト",
    "pronunciation": "ト",
    "pronunciationBaseForm": "ト",
    "writtenForm": "と",
    "writtenBaseForm": "と",
    "languageType": "和",
    "initialSoundAlterationType": "*",
    "initialSoundAlterationForm": "*",
    "finalSoundAlterationType": "*",
    "finalSoundAlterationForm": "*",
    "lemma": "と",
    "partOfSpeechLevel1": "助詞",
    "partOfSpeechLevel2": "格助詞",
    "partOfSpeechLevel3": "*",
    "partOfSpeechLevel4": "*",
    "conjugationForm": "*",
    "known": true,
    "allFeatures": "助詞,格助詞,*,*,*,*,ト,と,と,ト,と,ト,和,*,*,*,*",
    "user": false,
    "allFeaturesArray": [
      "助詞",
      "格助詞",
      "*",
      "*",
      "*",
      "*",
      "ト",
      "と",
      "と",
      "ト",
      "と",
      "ト",
      "和",
      "*",
      "*",
      "*",
      "*"
    ]
  },
  {
    "surface": "モーニング娘。",
    "position": 22,
    "conjugationType": "*",
    "lemmaReadingForm": "モーニングムスメ",
    "pronunciation": "モーニングムスメ",
    "pronunciationBaseForm": "モーニングムスメ",
    "writtenForm": "モーニング娘。",
    "writtenBaseForm": "モーニング娘。",
    "languageType": "固",
    "initialSoundAlterationType": "*",
    "initialSoundAlterationForm": "*",
    "finalSoundAlterationType": "*",
    "finalSoundAlterationForm": "*",
    "lemma": "モーニング娘。",
    "partOfSpeechLevel1": "名詞",
    "partOfSpeechLevel2": "固有名詞",
    "partOfSpeechLevel3": "人名",
    "partOfSpeechLevel4": "一般",
    "conjugationForm": "*",
    "known": true,
    "allFeatures": "名詞,固有名詞,人名,一般,*,*,モーニングムスメ,モーニング娘。,モーニング娘。,モーニングムスメ,モーニング娘。,モーニングムスメ,固,*,*,*,*",
    "user": false,
    "allFeaturesArray": [
      "名詞",
      "固有名詞",
      "人名",
      "一般",
      "*",
      "*",
      "モーニングムスメ",
      "モーニング娘。",
      "モーニング娘。",
      "モーニングムスメ",
      "モーニング娘。",
      "モーニングムスメ",
      "固",
      "*",
      "*",
      "*",
      "*"
    ]
  },
  {
    "surface": "と",
    "position": 29,
    "conjugationType": "*",
    "lemmaReadingForm": "ト",
    "pronunciation": "ト",
    "pronunciationBaseForm": "ト",
    "writtenForm": "と",
    "writtenBaseForm": "と",
    "languageType": "和",
    "initialSoundAlterationType": "*",
    "initialSoundAlterationForm": "*",
    "finalSoundAlterationType": "*",
    "finalSoundAlterationForm": "*",
    "lemma": "と",
    "partOfSpeechLevel1": "助詞",
    "partOfSpeechLevel2": "格助詞",
    "partOfSpeechLevel3": "*",
    "partOfSpeechLevel4": "*",
    "conjugationForm": "*",
    "known": true,
    "allFeatures": "助詞,格助詞,*,*,*,*,ト,と,と,ト,と,ト,和,*,*,*,*",
    "user": false,
    "allFeaturesArray": [
      "助詞",
      "格助詞",
      "*",
      "*",
      "*",
      "*",
      "ト",
      "と",
      "と",
      "ト",
      "と",
      "ト",
      "和",
      "*",
      "*",
      "*",
      "*"
    ]
  },
  {
    "surface": "おニャン子クラブ",
    "position": 30,
    "conjugationType": "*",
    "lemmaReadingForm": "オニャンコクラブ",
    "pronunciation": "オニャンコクラブ",
    "pronunciationBaseForm": "オニャンコクラブ",
    "writtenForm": "おニャン子クラブ",
    "writtenBaseForm": "おニャン子クラブ",
    "languageType": "固",
    "initialSoundAlterationType": "*",
    "initialSoundAlterationForm": "*",
    "finalSoundAlterationType": "*",
    "finalSoundAlterationForm": "*",
    "lemma": "おニャン子クラブ",
    "partOfSpeechLevel1": "名詞",
    "partOfSpeechLevel2": "固有名詞",
    "partOfSpeechLevel3": "一般",
    "partOfSpeechLevel4": "*",
    "conjugationForm": "*",
    "known": true,
    "allFeatures": "名詞,固有名詞,一般,*,*,*,オニャンコクラブ,おニャン子クラブ,おニャン子クラブ,オニャンコクラブ,おニャン子クラブ,オニャンコクラブ,固,*,*,*,*",
    "user": false,
    "allFeaturesArray": [
      "名詞",
      "固有名詞",
      "一般",
      "*",
      "*",
      "*",
      "オニャンコクラブ",
      "おニャン子クラブ",
      "おニャン子クラブ",
      "オニャンコクラブ",
      "おニャン子クラブ",
      "オニャンコクラブ",
      "固",
      "*",
      "*",
      "*",
      "*"
    ]
  },
  {
    "surface": "の",
    "position": 38,
    "conjugationType": "*",
    "lemmaReadingForm": "ノ",
    "pronunciation": "ノ",
    "pronunciationBaseForm": "ノ",
    "writtenForm": "の",
    "writtenBaseForm": "の",
    "languageType": "和",
    "initialSoundAlterationType": "*",
    "initialSoundAlterationForm": "*",
    "finalSoundAlterationType": "*",
    "finalSoundAlterationForm": "*",
    "lemma": "の",
    "partOfSpeechLevel1": "助詞",
    "partOfSpeechLevel2": "格助詞",
    "partOfSpeechLevel3": "*",
    "partOfSpeechLevel4": "*",
    "conjugationForm": "*",
    "known": true,
    "allFeatures": "助詞,格助詞,*,*,*,*,ノ,の,の,ノ,の,ノ,和,*,*,*,*",
    "user": false,
    "allFeaturesArray": [
      "助詞",
      "格助詞",
      "*",
      "*",
      "*",
      "*",
      "ノ",
      "の",
      "の",
      "ノ",
      "の",
      "ノ",
      "和",
      "*",
      "*",
      "*",
      "*"
    ]
  },
  {
    "surface": "夢",
    "position": 39,
    "conjugationType": "*",
    "lemmaReadingForm": "ユメ",
    "pronunciation": "ユメ",
    "pronunciationBaseForm": "ユメ",
    "writtenForm": "夢",
    "writtenBaseForm": "夢",
    "languageType": "和",
    "initialSoundAlterationType": "*",
    "initialSoundAlterationForm": "*",
    "finalSoundAlterationType": "*",
    "finalSoundAlterationForm": "*",
    "lemma": "夢",
    "partOfSpeechLevel1": "名詞",
    "partOfSpeechLevel2": "普通名詞",
    "partOfSpeechLevel3": "一般",
    "partOfSpeechLevel4": "*",
    "conjugationForm": "*",
    "known": true,
    "allFeatures": "名詞,普通名詞,一般,*,*,*,ユメ,夢,夢,ユメ,夢,ユメ,和,*,*,*,*",
    "user": false,
    "allFeaturesArray": [
      "名詞",
      "普通名詞",
      "一般",
      "*",
      "*",
      "*",
      "ユメ",
      "夢",
      "夢",
      "ユメ",
      "夢",
      "ユメ",
      "和",
      "*",
      "*",
      "*",
      "*"
    ]
  },
  {
    "surface": "の",
    "position": 40,
    "conjugationType": "*",
    "lemmaReadingForm": "ノ",
    "pronunciation": "ノ",
    "pronunciationBaseForm": "ノ",
    "writtenForm": "の",
    "writtenBaseForm": "の",
    "languageType": "和",
    "initialSoundAlterationType": "*",
    "initialSoundAlterationForm": "*",
    "finalSoundAlterationType": "*",
    "finalSoundAlterationForm": "*",
    "lemma": "の",
    "partOfSpeechLevel1": "助詞",
    "partOfSpeechLevel2": "格助詞",
    "partOfSpeechLevel3": "*",
    "partOfSpeechLevel4": "*",
    "conjugationForm": "*",
    "known": true,
    "allFeatures": "助詞,格助詞,*,*,*,*,ノ,の,の,ノ,の,ノ,和,*,*,*,*",
    "user": false,
    "allFeaturesArray": [
      "助詞",
      "格助詞",
      "*",
      "*",
      "*",
      "*",
      "ノ",
      "の",
      "の",
      "ノ",
      "の",
      "ノ",
      "和",
      "*",
      "*",
      "*",
      "*"
    ]
  },
  {
    "surface": "共演",
    "position": 41,
    "conjugationType": "*",
    "lemmaReadingForm": "キョウエン",
    "pronunciation": "キョーエン",
    "pronunciationBaseForm": "キョーエン",
    "writtenForm": "共演",
    "writtenBaseForm": "共演",
    "languageType": "漢",
    "initialSoundAlterationType": "*",
    "initialSoundAlterationForm": "*",
    "finalSoundAlterationType": "*",
    "finalSoundAlterationForm": "*",
    "lemma": "共演",
    "partOfSpeechLevel1": "名詞",
    "partOfSpeechLevel2": "普通名詞",
    "partOfSpeechLevel3": "サ変可能",
    "partOfSpeechLevel4": "*",
    "conjugationForm": "*",
    "known": true,
    "allFeatures": "名詞,普通名詞,サ変可能,*,*,*,キョウエン,共演,共演,キョーエン,共演,キョーエン,漢,*,*,*,*",
    "user": false,
    "allFeaturesArray": [
      "名詞",
      "普通名詞",
      "サ変可能",
      "*",
      "*",
      "*",
      "キョウエン",
      "共演",
      "共演",
      "キョーエン",
      "共演",
      "キョーエン",
      "漢",
      "*",
      "*",
      "*",
      "*"
    ]
  },
  {
    "surface": "です",
    "position": 43,
    "conjugationType": "助動詞-デス",
    "lemmaReadingForm": "デス",
    "pronunciation": "デス",
    "pronunciationBaseForm": "デス",
    "writtenForm": "です",
    "writtenBaseForm": "です",
    "languageType": "和",
    "initialSoundAlterationType": "*",
    "initialSoundAlterationForm": "*",
    "finalSoundAlterationType": "*",
    "finalSoundAlterationForm": "*",
    "lemma": "です",
    "partOfSpeechLevel1": "助動詞",
    "partOfSpeechLevel2": "*",
    "partOfSpeechLevel3": "*",
    "partOfSpeechLevel4": "*",
    "conjugationForm": "終止形-一般",
    "known": true,
    "allFeatures": "助動詞,*,*,*,助動詞-デス,終止形-一般,デス,です,です,デス,です,デス,和,*,*,*,*",
    "user": false,
    "allFeaturesArray": [
      "助動詞",
      "*",
      "*",
      "*",
      "助動詞-デス",
      "終止形-一般",
      "デス",
      "です",
      "です",
      "デス",
      "です",
      "デス",
      "和",
      "*",
      "*",
      "*",
      "*"
    ]
  },
  {
    "surface": "。",
    "position": 45,
    "conjugationType": "*",
    "lemmaReadingForm": "",
    "pronunciation": "",
    "pronunciationBaseForm": "",
    "writtenForm": "。",
    "writtenBaseForm": "。",
    "languageType": "記号",
    "initialSoundAlterationType": "*",
    "initialSoundAlterationForm": "*",
    "finalSoundAlterationType": "*",
    "finalSoundAlterationForm": "*",
    "lemma": "。",
    "partOfSpeechLevel1": "補助記号",
    "partOfSpeechLevel2": "句点",
    "partOfSpeechLevel3": "*",
    "partOfSpeechLevel4": "*",
    "conjugationForm": "*",
    "known": true,
    "allFeatures": "補助記号,句点,*,*,*,*,,。,。,,。,,記号,*,*,*,*",
    "user": false,
    "allFeaturesArray": [
      "補助記号",
      "句点",
      "*",
      "*",
      "*",
      "*",
      "",
      "。",
      "。",
      "",
      "。",
      "",
      "記号",
      "*",
      "*",
      "*",
      "*"
    ]
  },
  {
    "surface": "始まり",
    "position": 46,
    "conjugationType": "五段-ラ行",
    "lemmaReadingForm": "ハジマル",
    "pronunciation": "ハジマリ",
    "pronunciationBaseForm": "ハジマル",
    "writtenForm": "始まり",
    "writtenBaseForm": "始まる",
    "languageType": "和",
    "initialSoundAlterationType": "*",
    "initialSoundAlterationForm": "*",
    "finalSoundAlterationType": "*",
    "finalSoundAlterationForm": "*",
    "lemma": "始まる",
    "partOfSpeechLevel1": "動詞",
    "partOfSpeechLevel2": "一般",
    "partOfSpeechLevel3": "*",
    "partOfSpeechLevel4": "*",
    "conjugationForm": "連用形-一般",
    "known": true,
    "allFeatures": "動詞,一般,*,*,五段-ラ行,連用形-一般,ハジマル,始まる,始まり,ハジマリ,始まる,ハジマル,和,*,*,*,*",
    "user": false,
    "allFeaturesArray": [
      "動詞",
      "一般",
      "*",
      "*",
      "五段-ラ行",
      "連用形-一般",
      "ハジマル",
      "始まる",
      "始まり",
      "ハジマリ",
      "始まる",
      "ハジマル",
      "和",
      "*",
      "*",
      "*",
      "*"
    ]
  },
  {
    "surface": "ます",
    "position": 49,
    "conjugationType": "助動詞-マス",
    "lemmaReadingForm": "マス",
    "pronunciation": "マス",
    "pronunciationBaseForm": "マス",
    "writtenForm": "ます",
    "writtenBaseForm": "ます",
    "languageType": "和",
    "initialSoundAlterationType": "*",
    "initialSoundAlterationForm": "*",
    "finalSoundAlterationType": "*",
    "finalSoundAlterationForm": "*",
    "lemma": "ます",
    "partOfSpeechLevel1": "助動詞",
    "partOfSpeechLevel2": "*",
    "partOfSpeechLevel3": "*",
    "partOfSpeechLevel4": "*",
    "conjugationForm": "終止形-一般",
    "known": true,
    "allFeatures": "助動詞,*,*,*,助動詞-マス,終止形-一般,マス,ます,ます,マス,ます,マス,和,*,*,*,*",
    "user": false,
    "allFeaturesArray": [
      "助動詞",
      "*",
      "*",
      "*",
      "助動詞-マス",
      "終止形-一般",
      "マス",
      "ます",
      "ます",
      "マス",
      "ます",
      "マス",
      "和",
      "*",
      "*",
      "*",
      "*"
    ]
  },
  {
    "surface": "。",
    "position": 51,
    "conjugationType": "*",
    "lemmaReadingForm": "",
    "pronunciation": "",
    "pronunciationBaseForm": "",
    "writtenForm": "。",
    "writtenBaseForm": "。",
    "languageType": "記号",
    "initialSoundAlterationType": "*",
    "initialSoundAlterationForm": "*",
    "finalSoundAlterationType": "*",
    "finalSoundAlterationForm": "*",
    "lemma": "。",
    "partOfSpeechLevel1": "補助記号",
    "partOfSpeechLevel2": "句点",
    "partOfSpeechLevel3": "*",
    "partOfSpeechLevel4": "*",
    "conjugationForm": "*",
    "known": true,
    "allFeatures": "補助記号,句点,*,*,*,*,,。,。,,。,,記号,*,*,*,*",
    "user": false,
    "allFeaturesArray": [
      "補助記号",
      "句点",
      "*",
      "*",
      "*",
      "*",
      "",
      "。",
      "。",
      "",
      "。",
      "",
      "記号",
      "*",
      "*",
      "*",
      "*"
    ]
  }
]

「モーニング娘。」を一つの語として解析している。neologd辞書すごいね。

Apache Solr で形態素解析する

  • Solr管理UIで起動したコアを選択して'Analysis'の画面へ遷移。

  • 'Field Value (index)'に上記と同じ文章を入力する。

  • 'Field Type'に'text_ja'を選択する。

  • 'Analyse Values'ボタンをポチッとな。

../_images/apache_solr_analysis.png

そもそも組み込まれている辞書が違うので同じ結果にはならない。

Solrのkuromoji(lucene/kuromoji)とatilika/kuromojiはなんやかや別物らしい。featureのメソッド名とかまるで違うぞ。

lucene/kuromojiの辞書をビルドしなおすのはどうすればええんだ?

ユーザ辞書の例があった。ユーザ辞書の作り方が独特だなぁ。めんどくさいねえ。

$ cat ./example/files/conf/lang/userdict_ja.txt
#
# This is a sample user dictionary for Kuromoji (JapaneseTokenizer)
#
# Add entries to this file in order to override the statistical model in terms
# of segmentation, readings and part-of-speech tags.  Notice that entries do
# not have weights since they are always used when found.  This is by-design
# in order to maximize ease-of-use.
#
# Entries are defined using the following CSV format:
#  <text>,<token 1> ... <token n>,<reading 1> ... <reading n>,<part-of-speech tag>
#
# Notice that a single half-width space separates tokens and readings, and
# that the number tokens and readings must match exactly.
#
# Also notice that multiple entries with the same <text> is undefined.
#
# Whitespace only lines are ignored.  Comments are not allowed on entry lines.
#

# Custom segmentation for kanji compounds
日本経済新聞,日本 経済 新聞,ニホン ケイザイ シンブン,カスタム名詞
関西国際空港,関西 国際 空港,カンサイ コクサイ クウコウ,カスタム名詞

# Custom segmentation for compound katakana
トートバッグ,トート バッグ,トート バッグ,かずカナ名詞
ショルダーバッグ,ショルダー バッグ,ショルダー バッグ,かずカナ名詞

# Custom reading for former sumo wrestler
朝青龍,朝青龍,アサショウリュウ,カスタム人名
$