1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21 """
22 This module contains functions for identifying languages based on language
23 models.
24 """
25
26 from os import extsep, path
27
28 from translate.misc.file_discovery import get_abs_data_filename
29 from translate.storage.base import TranslationStore
30 from translate.lang.ngram import NGram
31
32
# Default locations used when the constructor is called without arguments
# (read there as C{self.MODEL_DIR} and C{self.CONF_FILE}).
MODEL_DIR = get_abs_data_filename('langmodels')
"""Directory in which the ngram language model files are stored."""

CONF_FILE = 'fpdb.conf'
"""
File name (resolved relative to C{MODEL_DIR}) of the configuration that
lists the language name-code pairs.
"""
41
def __init__(self, model_dir=None, conf_file=None):
    """Prepare the identifier from a model directory and its configuration.

    @type  model_dir: str
    @param model_dir: Directory holding the ngram language models. When
        C{None}, C{self.MODEL_DIR} is used.
    @type  conf_file: str
    @param conf_file: Name of the language name-code file; it is joined
        onto the model directory. When C{None}, C{self.CONF_FILE} is used.
    @raise ValueError: If the model directory or the configuration file
        does not exist.
    """
    models = self.MODEL_DIR if model_dir is None else model_dir
    if not path.isdir(models):
        raise ValueError('Directory does not exist: %s' % (models))

    conf_name = self.CONF_FILE if conf_file is None else conf_file
    # The configuration file always lives inside the model directory.
    conf_path = path.abspath(path.join(models, conf_name))
    if not path.isfile(conf_path):
        raise ValueError('File does not exist: %s' % (conf_path))

    # Filled by _load_config(): language model name -> language code.
    self._lang_codes = {}
    self._load_config(conf_path)
    self.ngram = NGram(models)
57
def _load_config(self, conf_file):
    """Load the mapping of language names to language codes as given in the
    configuration file.

    Each usable line holds at least two whitespace-separated fields: the
    language model file name and the language code. The model name is
    reduced to its base name without extension; a trailing C{-utf8} and
    then a single trailing C{-} or C{_} are stripped from the code. The
    result is stored in C{self._lang_codes} (name -> code).

    Blank lines, comment lines (first field starting with C{#}) and lines
    with fewer than two fields are skipped.

    @type  conf_file: str
    @param conf_file: Path of the configuration file to read.
    """
    # NOTE(review): the ``def`` line was missing from the reviewed source;
    # the signature is reconstructed from the call
    # ``self._load_config(conf_file)`` in the constructor -- confirm.
    with open(conf_file) as conf:  # make sure the handle is closed
        lines = conf.read().splitlines()

    for line in lines:
        parts = line.split()
        # Test the first field, not the raw line, so that indented
        # comments are recognised as well.
        if not parts or parts[0].startswith('#'):
            continue
        if len(parts) < 2:
            # Malformed entry without a language code: ignore it rather
            # than abort the whole load with an IndexError.
            continue
        lname, lcode = parts[0], parts[1]

        # Keep only the file's base name, minus its extension.
        lname = path.split(lname)[-1]
        if extsep in lname:
            lname = lname[:lname.rindex(extsep)]

        # Drop the encoding suffix and any separator left dangling by it.
        if lcode.endswith('-utf8'):
            lcode = lcode[:-len('-utf8')]
        if lcode.endswith(('-', '_')):
            lcode = lcode[:-1]

        self._lang_codes[lname] = lcode
79
def identify_lang(self, text):
    """Identify the language of the text in the given string.

    @type  text: unicode
    @param text: The text to classify.
    @returns: C{None} for empty text. Otherwise the classifier's result,
        translated through C{self._lang_codes} when it has an entry there
        and returned unchanged when it has not.
    """
    # NOTE(review): the ``def`` line was missing from the reviewed source;
    # the signature is reconstructed from the ``identify_lang(text)`` call
    # sites elsewhere in this file -- confirm.
    if not text:
        return None
    result = self.ngram.classify(text)
    # Map the model name to its language code where one is configured.
    return self._lang_codes.get(result, result)
88
def identify_source_lang(self, instore):
    """Identify the source language of the given translation store or
    units.

    Only the first 50 entries of C{instore} are considered, and of those
    only translatable units with a non-empty source.

    @type  instore: C{TranslationStore} or list or tuple of
        C{TranslationUnit}s.
    @param instore: The translation store to extract source text from.
    @returns: The identified language's code or C{None} if the language
        could not be identified.
    """
    # NOTE(review): the ``def`` line was missing from the reviewed source;
    # the name and signature are reconstructed from the docstring and the
    # sibling ``identify_lang`` -- confirm against callers.
    if not isinstance(instore, (TranslationStore, list, tuple)):
        return None

    # NOTE(review): assumes a TranslationStore supports slicing like a
    # list -- confirm.
    text = u' '.join(
        unit.source
        for unit in instore[:50]
        if unit.istranslatable() and unit.source
    )
    if not text:
        return None
    return self.identify_lang(text)
105
def identify_target_lang(self, instore):
    """Identify the target language of the given translation store or
    units.

    Only the first 200 entries of C{instore} are considered, and of those
    only translatable units with a non-empty target.

    @type  instore: C{TranslationStore} or list or tuple of
        C{TranslationUnit}s.
    @param instore: The translation store to extract target text from.
    @returns: The identified language's code or C{None} if the language
        could not be identified.
    """
    # NOTE(review): the ``def`` line was missing from the reviewed source;
    # the name and signature are reconstructed from the docstring and the
    # sibling ``identify_lang`` -- confirm against callers.
    if not isinstance(instore, (TranslationStore, list, tuple)):
        return None

    # NOTE(review): assumes a TranslationStore supports slicing like a
    # list -- confirm.
    text = u' '.join(
        unit.target
        for unit in instore[:200]
        if unit.istranslatable() and unit.target
    )
    if not text:
        return None
    return self.identify_lang(text)
122
if __name__ == "__main__":
    # Command-line helper: identify the language of the text passed as the
    # first argument, using the models in ../share/langmodels relative to
    # this script.
    import locale
    import sys

    if len(sys.argv) < 2:
        # Fail with a usage hint instead of an IndexError traceback.
        sys.stderr.write("Usage: %s TEXT\n" % sys.argv[0])
        sys.exit(1)

    script_dir = path.abspath(path.dirname(sys.argv[0]))
    identifier = LanguageIdentifier(path.join(script_dir, '..', 'share', 'langmodels'))

    text = sys.argv[1]
    if isinstance(text, bytes):
        # Python 2 hands over a byte string; decode it with the locale's
        # preferred encoding. Python 3 arguments are already text.
        text = text.decode(locale.getpreferredencoding())

    # A single formatted string prints the same on Python 2 and 3; the
    # print statement is a SyntaxError on Python 3.
    print("Language detected: %s" % identifier.identify_lang(text))
130