Source code for janome.tokenizer

# Copyright 2015 moco_beta
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


"""
The tokenizer module supplies Token and Tokenizer classes.

Usage:

>>> from janome.tokenizer import Tokenizer
>>> t = Tokenizer()
>>> for token in t.tokenize(u'すもももももももものうち'):
...   print(token)
...
すもも	名詞,一般,*,*,*,*,すもも,スモモ,スモモ
も	助詞,係助詞,*,*,*,*,も,モ,モ
もも	名詞,一般,*,*,*,*,もも,モモ,モモ
も	助詞,係助詞,*,*,*,*,も,モ,モ
もも	名詞,一般,*,*,*,*,もも,モモ,モモ
の	助詞,連体化,*,*,*,*,の,ノ,ノ
うち	名詞,非自立,副詞可能,*,*,*,うち,ウチ,ウチ

with wakati ('分かち書き') mode:

>>> from janome.tokenizer import Tokenizer
>>> t = Tokenizer()
>>> for token in t.tokenize(u'すもももももももものうち', wakati=True):
...   print(token)
...
すもも

もも

もも

うち

with user dictionary (IPAdic format):

.. code-block:: shell

  $ cat examples/user_ipadic.csv
  東京スカイツリー,1288,1288,4569,名詞,固有名詞,一般,*,*,*,東京スカイツリー,トウキョウスカイツリー,トウキョウスカイツリー
  東武スカイツリーライン,1288,1288,4700,名詞,固有名詞,一般,*,*,*,東武スカイツリーライン,トウブスカイツリーライン,トウブスカイツリーライン
  とうきょうスカイツリー駅,1288,1288,4143,名詞,固有名詞,一般,*,*,*,とうきょうスカイツリー駅,トウキョウスカイツリーエキ,トウキョウスカイツリーエキ

>>> t = Tokenizer("user_ipadic.csv", udic_enc="utf8")
>>> for token in t.tokenize(u'東京スカイツリーへのお越しは、東武スカイツリーライン「とうきょうスカイツリー駅」が便利です。'):
...  print(token)...
...
東京スカイツリー	名詞,固有名詞,一般,*,*,*,東京スカイツリー,トウキョウスカイツリー,トウキョウスカイツリー
へ	助詞,格助詞,一般,*,*,*,へ,ヘ,エ
の	助詞,連体化,*,*,*,*,の,ノ,ノ
お越し	名詞,一般,*,*,*,*,お越し,オコシ,オコシ
は	助詞,係助詞,*,*,*,*,は,ハ,ワ
、	記号,読点,*,*,*,*,、,、,、
東武スカイツリーライン	名詞,固有名詞,一般,*,*,*,東武スカイツリーライン,トウブスカイツリーライン,トウブスカイツリーライン
「	記号,括弧開,*,*,*,*,「,「,「
とうきょうスカイツリー駅	名詞,固有名詞,一般,*,*,*,とうきょうスカイツリー駅,トウキョウスカイツリーエキ,トウキョウスカイツリーエキ
」	記号,括弧閉,*,*,*,*,」,」,」
が	助詞,格助詞,一般,*,*,*,が,ガ,ガ
便利	名詞,形容動詞語幹,*,*,*,*,便利,ベンリ,ベンリ
です	助動詞,*,*,*,特殊・デス,基本形,です,デス,デス
。	記号,句点,*,*,*,*,。,。,。

with user dictionary (simplified format):

.. code-block:: shell

  $ cat examples/user_simpledic.csv
  東京スカイツリー,カスタム名詞,トウキョウスカイツリー
  東武スカイツリーライン,カスタム名詞,トウブスカイツリーライン
  とうきょうスカイツリー駅,カスタム名詞,トウキョウスカイツリーエキ

>>> t = Tokenizer("user_simpledic.csv", udic_type="simpledic", udic_enc="utf8")
>>> for token in t.tokenize(u'東京スカイツリーへのお越しは、東武スカイツリーライン「とうきょうスカイツリー駅」が便利です。'):
...   print(token)

"""

import sys
import os
from typing import Iterator, Union, Tuple, Optional
from .lattice import Lattice, Node, SurfaceNode, BOS, EOS, NodeType  # type: ignore
from .dic import SystemDictionary, MMapSystemDictionary, UserDictionary, CompiledUserDictionary  # type: ignore

try:
    from janome.sysdic import all_fstdata, entries, mmap_entries, connections, chardef, unknowns  # type: ignore
except ImportError:
    # hack for unit testing...
    parent_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
    sys.path.insert(0, parent_dir)
    from sysdic import all_fstdata, entries, mmap_entries, connections, chardef, unknowns  # type: ignore


DEFAULT_MMAP_MODE = sys.maxsize > 2**32


[docs]class Token(object): """ A Token object contains all information for a token. """
[docs] def __init__(self, node: Node, extra: Tuple = None): self.node = node self.extra = extra
def __getattr__(self, name): if name == 'surface': return self.node.surface elif name == 'part_of_speech': return self.extra[0] if self.extra else self.node.part_of_speech elif name == 'infl_type': return self.extra[1] if self.extra else self.node.infl_type elif name == 'infl_form': return self.extra[2] if self.extra else self.node.infl_form elif name == 'base_form': return self.extra[3] if self.extra else self.node.base_form elif name == 'reading': return self.extra[4] if self.extra else self.node.reading elif name == 'phonetic': return self.extra[5] if self.extra else self.node.phonetic elif name == 'node_type': return self.node.node_type else: None def __str__(self): return f'{self.surface}\t' \ f'{self.part_of_speech},{self.infl_type},{self.infl_form},{self.base_form},{self.reading},{self.phonetic}'
[docs]class Tokenizer(object): """ A Tokenizer tokenizes Japanese texts with system and optional user defined dictionary. """ MAX_CHUNK_SIZE = 1024 CHUNK_SIZE = 500
[docs] def __init__(self, udic: str = '', *, udic_enc: str = 'utf8', udic_type: str = 'ipadic', max_unknown_length: int = 1024, wakati: bool = False, mmap: bool = DEFAULT_MMAP_MODE, dotfile: str = ''): """ Initialize Tokenizer object with optional arguments. :param udic: (Optional) user dictionary file (CSV format) or directory path to compiled dictionary data :param udic_enc: (Optional) character encoding for user dictionary. default is 'utf-8' :param udic_type: (Optional) user dictionray type. supported types are 'ipadic' and 'simpledic'. default is 'ipadic' :param max_unknows_length: (Optional) max unknown word length. default is 1024. :param wakati: (Optional) if given True load minimum sysdic data for 'wakati' mode. :param mmap: (Optional) if given False, memory-mapped file mode is disabled. Set this option to False on any environments that do not support mmap. Default is True on 64bit architecture; otherwise False. .. seealso:: http://mocobeta.github.io/janome/en/#use-with-user-defined-dictionary """ self.sys_dic: Union[SystemDictionary, MMapSystemDictionary] self.user_dic: Optional[Union[UserDictionary, CompiledUserDictionary]] self.wakati = wakati if mmap: self.sys_dic = MMapSystemDictionary(all_fstdata(), mmap_entries(wakati), connections, chardef.DATA, unknowns.DATA) else: self.sys_dic = SystemDictionary(all_fstdata(), entries(wakati), connections, chardef.DATA, unknowns.DATA) if udic: if udic.endswith('.csv'): # build user dictionary from CSV self.user_dic = UserDictionary(udic, udic_enc, udic_type, connections) elif os.path.isdir(udic): # load compiled user dictionary self.user_dic = CompiledUserDictionary(udic, connections) else: self.user_dic = None else: self.user_dic = None self.max_unknown_length = max_unknown_length
[docs] def tokenize(self, text: str, *, wakati: bool = False, baseform_unk: bool = True, dotfile: str = '') \ -> Iterator[Union[Token, str]]: """ Tokenize the input text. :param text: unicode string to be tokenized :param wakati: (Optinal) if given True returns surface forms only. default is False. :param baseform_unk: (Optional) if given True sets base_form attribute for unknown tokens. default is True. :param dotfile: (Optional) if specified, graphviz dot file is output to the path for later visualizing of the lattice graph. This option is ignored when the input length is larger than MAX_CHUNK_SIZE. :return: generator yielding tokens (wakati=False) or generator yielding string (wakati=True) """ if self.wakati: wakati = True if dotfile and len(text) < Tokenizer.MAX_CHUNK_SIZE: return self.__tokenize_stream(text, wakati, baseform_unk, dotfile) else: return self.__tokenize_stream(text, wakati, baseform_unk, '')
def __tokenize_stream(self, text, wakati, baseform_unk, dotfile): text = text.strip() text_length = len(text) processed = 0 while processed < text_length: tokens, pos = self.__tokenize_partial(text[processed:], wakati, baseform_unk, dotfile) for token in tokens: yield token processed += pos def __tokenize_partial(self, text, wakati, baseform_unk, dotfile): if self.wakati and not wakati: raise WakatiModeOnlyException chunk_size = min(len(text), Tokenizer.MAX_CHUNK_SIZE) lattice = Lattice(chunk_size, self.sys_dic) pos = 0 while not self.__should_split(text, pos): encoded_partial_text = text[pos:pos + min(50, chunk_size - pos)].encode('utf-8') # user dictionary if self.user_dic: entries = self.user_dic.lookup(encoded_partial_text) for e in entries: lattice.add(SurfaceNode(e, NodeType.USER_DICT)) matched = len(entries) > 0 # system dictionary entries = self.sys_dic.lookup(encoded_partial_text) for e in entries: lattice.add(SurfaceNode(e, NodeType.SYS_DICT)) matched = len(entries) > 0 # unknown cates = self.sys_dic.get_char_categories(text[pos]) if cates: for cate in cates: if matched and not self.sys_dic.unknown_invoked_always(cate): continue # unknown word length length = self.sys_dic.unknown_length(cate) \ if not self.sys_dic.unknown_grouping(cate) else self.max_unknown_length assert length >= 0 # buffer for unknown word buf = text[pos] for p in range(pos + 1, min(chunk_size, pos + length + 1)): _cates = self.sys_dic.get_char_categories(text[p]) if cate in _cates or any(cate in _compat_cates for _compat_cates in _cates.values()): buf += text[p] else: break unknown_entries = self.sys_dic.unknowns.get(cate) assert unknown_entries for entry in unknown_entries: left_id, right_id, cost, part_of_speech = entry base_form = buf if baseform_unk else '*' dummy_dict_entry = (buf, left_id, right_id, cost, part_of_speech, '*', '*', base_form, '*', '*') lattice.add(Node(dummy_dict_entry, NodeType.UNKNOWN)) pos += lattice.forward() lattice.end() min_cost_path = lattice.backward() assert isinstance(min_cost_path[0], BOS) assert isinstance(min_cost_path[-1], EOS) if wakati: tokens = [node.surface for node in min_cost_path[1:-1]] else: tokens = [] for node in min_cost_path[1:-1]: if type(node) == SurfaceNode and node.node_type == NodeType.SYS_DICT: tokens.append(Token(node, self.sys_dic.lookup_extra(node.num))) elif type(node) == SurfaceNode and node.node_type == NodeType.USER_DICT: tokens.append(Token(node, self.user_dic.lookup_extra(node.num))) else: tokens.append(Token(node)) if dotfile: lattice.generate_dotfile(filename=dotfile) return (tokens, pos) def __should_split(self, text, pos): return \ pos >= len(text) or \ pos >= Tokenizer.MAX_CHUNK_SIZE or \ (pos >= Tokenizer.CHUNK_SIZE and self.__splittable(text[:pos])) def __splittable(self, text): return self.__is_punct(text[-1]) or self.__is_newline(text) def __is_punct(self, c): return c == u'、' or c == u'。' or c == u',' or c == u'.' or c == u'?' or c == u'?' or c == u'!' or c == u'!' def __is_newline(self, text): return text.endswith('\n\n') or text.endswith('\r\n\r\n')
[docs]class WakatiModeOnlyException(Exception): pass