Chinese, Japanese and Korean (CJK) languages

Manticore provides built-in support for indexing CJK texts, allowing you to process CJK texts in two different ways:

  1. Precise segmentation using the ICU library. Currently, only Chinese is supported.
SQL:
CREATE TABLE products(title text, price float) charset_table = 'cjk' morphology = 'icu_chinese'
POST /cli -d "
CREATE TABLE products(title text, price float) charset_table = 'cjk' morphology = 'icu_chinese'"
$index = new \Manticoresearch\Index($client);
$index->setName('products');
$index->create([
            'title'=>['type'=>'text'],
            'price'=>['type'=>'float']
        ],[
            'charset_table' => 'cjk',
            'morphology' => 'icu_chinese'
        ]);
Python:
utilsApi.sql('CREATE TABLE products(title text, price float) charset_table = \'cjk\' morphology = \'icu_chinese\'')
Javascript:
res = await utilsApi.sql('CREATE TABLE products(title text, price float) charset_table = \'cjk\' morphology = \'icu_chinese\'');
Java:
utilsApi.sql("CREATE TABLE products(title text, price float) charset_table = 'cjk' morphology = 'icu_chinese'");
table products {
  charset_table = cjk
  morphology = icu_chinese
  
  type = rt
  path = tbl
  rt_field = title
  rt_attr_uint = price
}
  1. Basic support using the N-gram options ngram_len and ngram_chars For each CJK language, there are separate character set tables (chinese, korean, japanese) that can be used, or you can use the common cjk character set table.
SQL:
CREATE TABLE products(title text, price float) charset_table = 'non_cjk' ngram_len = '1' ngram_chars = 'cjk'
POST /cli -d "
CREATE TABLE products(title text, price float) charset_table = 'non_cjk' ngram_len = '1' ngram_chars = 'cjk'"
$index = new \Manticoresearch\Index($client);
$index->setName('products');
$index->create([
            'title'=>['type'=>'text'],
            'price'=>['type'=>'float']
        ],[
             'charset_table' => 'non_cjk',
             'ngram_len' => '1',
             'ngram_chars' => 'cjk'
        ]);
Python:
utilsApi.sql('CREATE TABLE products(title text, price float) charset_table = \'non_cjk\' ngram_len = \'1\' ngram_chars = \'cjk\'')
javascript:
res = await utilsApi.sql('CREATE TABLE products(title text, price float) charset_table = \'non_cjk\' ngram_len = \'1\' ngram_chars = \'cjk\'');
java:
utilsApi.sql("CREATE TABLE products(title text, price float) charset_table = 'non_cjk' ngram_len = '1' ngram_chars = 'cjk'");
table products {
  charset_table = non_cjk
  ngram_len = 1
  ngram_chars = cjk

  type = rt
  path = tbl
  rt_field = title
  rt_attr_uint = price
}

Additionally, there is built-in support for Chinese stopwords with the alias zh.

SQL:
CREATE TABLE products(title text, price float) charset_table = 'chinese' morphology = 'icu_chinese' stopwords = 'zh'
POST /cli -d "
CREATE TABLE products(title text, price float) charset_table = 'chinese' morphology = 'icu_chinese' stopwords = 'zh'"
PHP:
$index = new \Manticoresearch\Index($client);
$index->setName('products');
$index->create([
            'title'=>['type'=>'text'],
            'price'=>['type'=>'float']
        ],[
            'charset_table' => 'chinese',
            'morphology' => 'icu_chinese',
            'stopwords' => 'zh'
        ]);
Python:
utilsApi.sql('CREATE TABLE products(title text, price float) charset_table = \'chinese\' morphology = \'icu_chinese\' stopwords = \'zh\'')
javascript:
res = await utilsApi.sql('CREATE TABLE products(title text, price float) charset_table = \'chinese\' morphology = \'icu_chinese\' stopwords = \'zh\'');
java:
utilsApi.sql("CREATE TABLE products(title text, price float) charset_table = 'chinese' morphology = 'icu_chinese' stopwords = 'zh'");
table products {
  charset_table = chinese
  morphology = icu_chinese
  stopwords = zh
  
  type = rt
  path = tbl
  rt_field = title
  rt_attr_uint = price
}