Source code for janome.dic

# Copyright 2015 moco_beta
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from abc import ABC, abstractmethod
import os
import io
import pickle
import gzip
from struct import pack, unpack
import traceback
import logging
import sys
import re
import pkgutil
import zlib
import base64
from functools import lru_cache
from .fst import Matcher, create_minimum_transducer, compileFST

logger = logging.getLogger(__name__)
logger.setLevel(logging.WARN)
handler = logging.StreamHandler()
handler.setLevel(logging.WARN)
formatter = logging.Formatter('%(asctime)s\t%(name)s - %(levelname)s\t%(message)s')
handler.setFormatter(formatter)
logger.addHandler(handler)

MODULE_FST_DATA = 'fst_data%d.py'
MODULE_ENTRIES_EXTRA = 'entries_extra%d.py'
MODULE_ENTRIES_COMPACT = 'entries_compact%d.py'
MODULE_ENTRIES_BUCKETS = 'entries_buckets.py'
MODULE_CONNECTIONS = 'connections%d.py'
MODULE_CHARDEFS = 'chardef.py'
MODULE_UNKNOWNS = 'unknowns.py'

FILE_USER_FST_DATA = 'user_fst.data'
FILE_USER_ENTRIES_DATA = 'user_entries.data'


def save_fstdata(data, dir, part=0):
    _save_as_module(os.path.join(dir, MODULE_FST_DATA % part), data, binary=True)


def start_save_entries(dir, bucket_idx, morph_offset):
    _start_entries_as_module(os.path.join(dir, MODULE_ENTRIES_COMPACT % bucket_idx), morph_offset)
    _start_entries_as_module(os.path.join(dir, MODULE_ENTRIES_EXTRA % bucket_idx), morph_offset)


def end_save_entries(dir, bucket_idx):
    _end_entries_as_module(os.path.join(dir, MODULE_ENTRIES_COMPACT % bucket_idx))
    _end_entries_as_module(os.path.join(dir, MODULE_ENTRIES_EXTRA % bucket_idx))


def save_entry(dir, bucket_idx, morph_id, entry):
    _save_entry_as_module_compact(os.path.join(dir, MODULE_ENTRIES_COMPACT % bucket_idx), morph_id, entry)
    _save_entry_as_module_extra(os.path.join(dir, MODULE_ENTRIES_EXTRA % bucket_idx), morph_id, entry)


def save_entry_buckets(dir, buckets):
    _save_as_module(os.path.join(dir, MODULE_ENTRIES_BUCKETS), buckets)


def save_connections(connections, dir='.'):
    # split the whole connection matrix into 2 buckets to reduce memory usage while installing.
    # TODO: find better ways...
    bucket_size = (len(connections) // 2) + 1
    offset = 0
    for i in range(1, 3):
        _save_as_module(os.path.join(dir, MODULE_CONNECTIONS % i), connections[offset:offset + bucket_size])
        offset += bucket_size


def save_chardefs(chardefs, dir='.'):
    _save_as_module(os.path.join(dir, MODULE_CHARDEFS), chardefs)


def save_unknowns(unknowns, dir='.'):
    _save_as_module(os.path.join(dir, MODULE_UNKNOWNS), unknowns)


def _save(file, data, compresslevel):
    if not data:
        return
    with gzip.open(file, 'wb', compresslevel) as f:
        f.write(data)
        f.flush()


def _load(file):
    if not os.path.exists(file):
        return None
    with gzip.open(file, 'rb') as f:
        data = f.read()
    return data


def _load_package_data(package, resource):
    try:
        rawdata = pkgutil.get_data(package, resource)
    except IOError:
        return None
    return zlib.decompress(rawdata, zlib.MAX_WBITS | 16)


def _save_as_module(file, data, binary=False):
    if not data:
        return
    with open(file, 'w') as f:
        f.write('DATA=')
        if binary:
            f.write('"')
            f.write(base64.b64encode(data).decode('ascii'))
            f.write('"')
        else:
            f.write(str(data).replace('\\\\', '\\'))
        f.flush()


def _start_entries_as_module(file, morph_id_offset):
    idx_file = re.sub(r'\.py$', '_idx.py', file)
    with open(file, 'w') as f:
        with open(idx_file, 'w') as f_idx:
            f.write('DATA={')
            f_idx.write('DATA={')
            f_idx.write(f'"offset": {morph_id_offset}, "positions": [')


def _end_entries_as_module(file):
    idx_file = re.sub(r'\.py$', '_idx.py', file)
    with open(file, 'a') as f:
        with open(idx_file, 'a') as f_idx:
            f.write('}\n')
            f_idx.write(']}\n')
            f.flush()
            f_idx.flush()


def _save_entry_as_module_compact(file, morph_id, entry):
    idx_file = re.sub(r'\.py$', '_idx.py', file)
    with open(file, 'a') as f:
        with open(idx_file, 'a') as f_idx:
            f.write('%d:(' % morph_id)
            # record the byte position of the entry body in the companion index module
            pos = f.tell()
            f_idx.write(f'{pos},')
            s = u"u'%s',%4d,%4d,%5d" % (
                entry[0].encode('unicode_escape').decode('ascii'), entry[1], entry[2], entry[3])
            f.write(s)
            f.write('),')


def _save_entry_as_module_extra(file, morph_id, entry):
    idx_file = re.sub(r'\.py$', '_idx.py', file)
    with open(file, 'a') as f:
        with open(idx_file, 'a') as f_idx:
            f.write('%d:(' % morph_id)
            # record the byte position of the entry body in the companion index module
            pos = f.tell()
            f_idx.write(f'{pos},')
            s = u"u'%s',u'%s',u'%s',u'%s',u'%s',u'%s'" % (
                entry[4].encode('unicode_escape').decode('ascii'),
                entry[5].encode('unicode_escape').decode('ascii'),
                entry[6].encode('unicode_escape').decode('ascii'),
                entry[7].encode('unicode_escape').decode('ascii'),
                entry[8].encode('unicode_escape').decode('ascii'),
                entry[9].encode('unicode_escape').decode('ascii'))
            f.write(s)
            f.write('),')
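# A minimal sketch of the module format the helpers above emit, assuming a tiny
# hypothetical connection matrix (real data is much larger). _save_as_module()
# writes a single DATA assignment (base64-encoded when binary=True), and the
# entries helpers additionally write a companion '*_idx.py' module holding the
# morph id offset and byte positions that MMapDictionary uses for random access.
#
#     save_connections([[0, 1], [2, 3], [4, 5], [6, 7]], dir='sysdic')
#     # -> sysdic/connections1.py : DATA=[[0, 1], [2, 3], [4, 5]]
#     # -> sysdic/connections2.py : DATA=[[6, 7]]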
class Dictionary(ABC):
    """
    Base dictionary class
    """

    @abstractmethod
    def lookup(self, s, matcher):
        pass

    @abstractmethod
    def lookup_extra(self, num):
        pass

    @abstractmethod
    def get_trans_cost(self, id1, id2):
        pass
class RAMDictionary(Dictionary):
    """
    RAM dictionary class
    """

    def __init__(self, entries, connections):
        self.entries = entries
        self.connections = connections

    def lookup(self, s, matcher):
        (matched, outputs) = matcher.run(s)
        if not matched:
            return []
        try:
            res = []
            for e in outputs:
                num = unpack('I', e)[0]
                res.append((num,) + self.entries[num][:4])
            return res
        except Exception:
            logger.error('Cannot load dictionary data. The dictionary may be corrupted?')
            logger.error(f'input={s}')
            logger.error(f'outputs={str(outputs)}')
            logger.error(traceback.format_exc())
            sys.exit(1)

    def lookup_extra(self, num):
        try:
            return self.entries[num][4:]
        except Exception:
            logger.error('Cannot load dictionary data. The dictionary may be corrupted?')
            logger.error(traceback.format_exc())
            sys.exit(1)

    def get_trans_cost(self, id1, id2):
        return self.connections[id1][id2]
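# A minimal sketch of how RAMDictionary is used, assuming `fst_data` holds compiled
# FST bytes for the stored surfaces and `entries` maps each morph id to a 10-element
# tuple (surface, left_id, right_id, cost, part_of_speech, infl_type, infl_form,
# base_form, reading, phonetic), as produced by UserDictionary.build_dic() below.
# The matcher operates on UTF-8 encoded input, since surfaces are stored encoded.
#
#     matcher = Matcher([fst_data])
#     ram_dic = RAMDictionary(entries, connections)
#     ram_dic.lookup('すもも'.encode('utf-8'), matcher)
#     # -> [(morph_id, surface, left_id, right_id, cost), ...]
#     ram_dic.lookup_extra(morph_id)
#     # -> (part_of_speech, infl_type, infl_form, base_form, reading, phonetic)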
class MMapDictionary(Dictionary):
    """
    MMap dictionary class
    """

    def __init__(self, entries_compact, entries_extra, open_files, connections):
        self.entries_compact = entries_compact
        self.bucket_ranges = entries_compact.keys()
        self.entries_extra = entries_extra
        self.open_files = open_files
        self.connections = connections

    def lookup(self, s, matcher):
        (matched, outputs) = matcher.run(s)
        if not matched:
            return []
        try:
            matched_entries = []
            for e in outputs:
                idx = unpack('I', e)[0]
                matched_entries.append((idx,) + self._find_entry(idx))
            return matched_entries
        except Exception:
            logger.error('Cannot load dictionary data. The dictionary may be corrupted?')
            logger.error(f'input={s}')
            logger.error(f'outputs={str(outputs)}')
            logger.error(traceback.format_exc())
            sys.exit(1)

    @lru_cache(maxsize=8192)
    def _find_entry(self, idx):
        bucket = next(filter(lambda b: idx >= b[0] and idx < b[1], self.bucket_ranges))
        mm, mm_idx = self.entries_compact[bucket]
        rel_idx = idx - mm_idx['offset']
        # entries are serialized as u'<surface>',%4d,%4d,%5d (see _save_entry_as_module_compact),
        # so the numeric fields can be sliced at fixed offsets after the surface.
        _pos1s = mm_idx['positions'][rel_idx] + 2
        _pos1e = mm.find(b"',", _pos1s)
        _pos2s = _pos1e + 2
        _pos2e = _pos2s + 4
        _pos3s = _pos2e + 1
        _pos3e = _pos3s + 4
        _pos4s = _pos3e + 1
        _pos4e = _pos4s + 5
        _entry = (
            mm[_pos1s:_pos1e].decode('unicode_escape'),
            int(mm[_pos2s:_pos2e]),
            int(mm[_pos3s:_pos3e]),
            int(mm[_pos4s:_pos4e]))
        return _entry

    @lru_cache(maxsize=1024)
    def lookup_extra(self, idx):
        try:
            bucket = next(filter(lambda b: idx >= b[0] and idx < b[1], self.bucket_ranges))
            mm, mm_idx = self.entries_extra[bucket]
            rel_idx = idx - mm_idx['offset']
            # extra fields are serialized as six u'...' strings (see _save_entry_as_module_extra)
            _pos1s = mm_idx['positions'][rel_idx] + 2
            _pos1e = mm.find(b"',u'", _pos1s)
            _pos2s = _pos1e + 4
            _pos2e = mm.find(b"',u'", _pos2s)
            _pos3s = _pos2e + 4
            _pos3e = mm.find(b"',u'", _pos3s)
            _pos4s = _pos3e + 4
            _pos4e = mm.find(b"',u'", _pos4s)
            _pos5s = _pos4e + 4
            _pos5e = mm.find(b"',u'", _pos5s)
            _pos6s = _pos5e + 4
            _pos6e = mm.find(b"')", _pos6s)
            return (
                mm[_pos1s:_pos1e].decode('unicode_escape'),
                mm[_pos2s:_pos2e].decode('unicode_escape'),
                mm[_pos3s:_pos3e].decode('unicode_escape'),
                mm[_pos4s:_pos4e].decode('unicode_escape'),
                mm[_pos5s:_pos5e].decode('unicode_escape'),
                mm[_pos6s:_pos6e].decode('unicode_escape'))
        except Exception:
            logger.error('Cannot load extra info. The dictionary may be corrupted?')
            logger.error(f'idx={idx}')
            logger.error(traceback.format_exc())
            sys.exit(1)

    def get_trans_cost(self, id1, id2):
        return self.connections[id1][id2]

    def __del__(self):
        for mm, mm_idx in self.entries_compact.values():
            mm.close()
        if self.entries_extra:
            for mm, mm_idx in self.entries_extra.values():
                mm.close()
        for fp in self.open_files:
            fp.close()
class UnknownsDictionary(object):
    """
    Dictionary class for handling unknown words
    """

    def __init__(self, chardefs, unknowns):
        self.char_categories = chardefs[0]
        self.char_ranges = chardefs[1]
        self.unknowns = unknowns

    @lru_cache(maxsize=1024)
    def get_char_categories(self, c):
        res = {}
        for chr_range in self.char_ranges:
            if chr_range['from'] <= c <= chr_range['to']:
                cate = chr_range['cate']
                compat_cates = chr_range['compat_cates'] if 'compat_cates' in chr_range else []
                res[cate] = compat_cates
        if not res:
            res = {'DEFAULT': []}
        return res

    def unknown_invoked_always(self, cate):
        if cate in self.char_categories:
            return self.char_categories[cate]['INVOKE']
        return False

    def unknown_grouping(self, cate):
        if cate in self.char_categories:
            return self.char_categories[cate]['GROUP']
        return False

    def unknown_length(self, cate):
        if cate in self.char_categories:
            return self.char_categories[cate]['LENGTH']
        return -1
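# A minimal sketch of the chardefs structure this class expects; the values below
# are hypothetical and only mirror the keys read above, while the real character
# definitions ship with the system dictionary package.
#
#     char_categories = {'KATAKANA': {'INVOKE': True, 'GROUP': True, 'LENGTH': 2}}
#     char_ranges = [{'from': 'ァ', 'to': 'ヶ', 'cate': 'KATAKANA'}]
#     unk_dic = UnknownsDictionary((char_categories, char_ranges), unknowns={})
#     unk_dic.get_char_categories('ア')           # -> {'KATAKANA': []}
#     unk_dic.get_char_categories('a')            # -> {'DEFAULT': []}
#     unk_dic.unknown_invoked_always('KATAKANA')  # -> True
#     unk_dic.unknown_length('KATAKANA')          # -> 2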
class UserDictionary(RAMDictionary):
    """
    User dictionary class (on-the-fly)
    """

    def __init__(self, user_dict, enc, type, connections, progress_handler=None):
        """
        Initialize user defined dictionary object.

        :param user_dict: user dictionary file (CSV format)
        :param enc: character encoding
        :param type: user dictionary type. supported types are 'ipadic' and 'simpledic'
        :param connections: connection cost matrix. expected value is SYS_DIC.connections
        :param progress_handler: handler mainly to indicate progress, implementation of ProgressHandler

        .. seealso:: http://mocobeta.github.io/janome/en/#use-with-user-defined-dictionary
        """
        fst_data, entries = UserDictionary.build_dic(user_dict, enc, type, progress_handler)
        super().__init__(entries, connections)
        self.compiledFST = [fst_data]
        self.matcher = Matcher([fst_data])

    def lookup(self, s):
        return super().lookup(s, self.matcher)

    @classmethod
    def line_to_entry_ipadic(cls, line):
        """Convert an IPADIC formatted string to a user dictionary entry."""
        surface, left_id, right_id, cost, \
            pos_major, pos_minor1, pos_minor2, pos_minor3, \
            infl_type, infl_form, base_form, reading, phonetic = line.split(',')
        part_of_speech = ','.join([pos_major, pos_minor1, pos_minor2, pos_minor3])
        return (surface, int(left_id), int(right_id), int(cost),
                part_of_speech, infl_type, infl_form, base_form, reading, phonetic)

    @classmethod
    def line_to_entry_simpledic(cls, line):
        """Convert a simpledic formatted string to a user dictionary entry."""
        surface, pos_major, reading = line.split(',')
        part_of_speech = ','.join([pos_major, '*', '*', '*'])
        return (surface, 0, 0, -100000,
                part_of_speech, '*', '*', surface, reading, reading)

    @classmethod
    def build_dic(cls, user_dict, enc, dict_type, progress_handler):
        surfaces = []
        entries = {}
        line_to_entry = getattr(cls, 'line_to_entry_' + dict_type)
        # init progress for reading CSV
        if progress_handler:
            with open(user_dict, encoding=enc) as f:
                progress_handler.on_start(
                    total=sum(1 for line in f), desc='Reading user dictionary from CSV')
        with io.open(user_dict, encoding=enc) as f:
            for line in f:
                line = line.rstrip()
                # entry should be a tuple:
                # (surface, left_id, right_id, cost, part_of_speech, infl_type, infl_form, base_form, reading, phonetic)
                entry = line_to_entry(line)
                morph_id = len(surfaces)
                surfaces.append((entry[0].encode('utf8'), pack('I', morph_id)))
                entries[morph_id] = entry
                # update progress
                if progress_handler:
                    progress_handler.on_progress()
        # complete progress for reading CSV
        if progress_handler:
            progress_handler.on_complete()
        inputs = sorted(surfaces)  # inputs must be sorted.
        assert len(surfaces) == len(entries)
        # init progress for create_minimum_transducer
        if progress_handler:
            progress_handler.on_start(
                total=len(inputs), desc='Running create_minimum_transducer')
        processed, fst = create_minimum_transducer(
            inputs, on_progress=progress_handler.on_progress if progress_handler else None)
        # complete progress for create_minimum_transducer
        if progress_handler:
            progress_handler.on_complete()
        compiledFST = compileFST(fst)
        return compiledFST, entries

    def save(self, to_dir, compressionlevel=9):
        """
        Save compressed compiled dictionary data.

        :param to_dir: directory to save dictionary data
        :param compressionlevel: (Optional) gzip compression level. default is 9
        """
        if os.path.exists(to_dir) and not os.path.isdir(to_dir):
            raise Exception(f'Not a directory : {to_dir}')
        elif not os.path.exists(to_dir):
            os.makedirs(to_dir, mode=int('0755', 8))
        _save(os.path.join(to_dir, FILE_USER_FST_DATA), self.compiledFST[0], compressionlevel)
        _save(os.path.join(to_dir, FILE_USER_ENTRIES_DATA), pickle.dumps(self.entries), compressionlevel)
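# A minimal sketch of building and saving a user dictionary directly. The CSV path
# and output directory are hypothetical, and the connection matrix is assumed to be
# the system one (the SYS_DIC.connections mentioned in the __init__ docstring).
# In typical use the same result is achieved through janome.tokenizer.Tokenizer's
# udic / udic_enc / udic_type arguments (see the URL referenced above).
#
#     user_dic = UserDictionary('userdic.csv', 'utf8', 'ipadic', SYS_DIC.connections)
#     user_dic.save('/tmp/userdic')   # writes user_fst.data and user_entries.data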
class CompiledUserDictionary(RAMDictionary):
    """
    User dictionary class (compiled)
    """

    def __init__(self, dic_dir, connections):
        fst_data, entries = CompiledUserDictionary.load_dict(dic_dir)
        super().__init__(entries, connections)
        self.matcher = Matcher([fst_data])

    def lookup(self, s):
        return super().lookup(s, self.matcher)

    @classmethod
    def load_dict(cls, dic_dir):
        if not os.path.exists(dic_dir) or not os.path.isdir(dic_dir):
            raise Exception(f'No such directory : {dic_dir}')
        data = _load(os.path.join(dic_dir, FILE_USER_FST_DATA))
        entries = pickle.loads(_load(os.path.join(dic_dir, FILE_USER_ENTRIES_DATA)))
        return data, entries
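# A minimal sketch of loading a dictionary previously written by UserDictionary.save().
# The directory is hypothetical and the connection matrix is again assumed to be the
# system one; lookup() returns the same tuple shape as RAMDictionary.lookup().
#
#     user_dic = CompiledUserDictionary('/tmp/userdic', SYS_DIC.connections)
#     user_dic.lookup('すもも'.encode('utf-8'))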
class LoadingDictionaryError(Exception):

    def __init__(self):
        self.message = 'Cannot load dictionary data. Try mmap mode for very large dictionary.'