easy_tokenizer package¶
Submodules¶
easy_tokenizer.patterns module¶
class to store knowledge used for tokenization
-
class
easy_tokenizer.patterns.
Patterns
¶ Bases:
object
Contains a set of special chars that could be used for tokenization
-
ABBREV_RE
= re.compile('(\\w\\.){2,}|(?:jan|feb|mar|apr|jun|jul|aug|sep|Sept|sept|SEPT|oct|nov|dec)\\.')¶
-
ALL_WEB_CAPTURED_RE
= re.compile('((?:(?:http[s]?|ftp)://|wwww?[.])(?:[a-zA-Z]|[0-9]|[-_:\\/?@.&+=]|(?:%[0-9a-fA-F][0-9a-fA-F]))+|(?:[0-9a-zA-Z][-\\w_]+)(?:\\.[0-9a-zA-Z][-\\w_]+){2,5}(?:(?:\\/(?:[0-9a-zA-Z]|[-_?.#=:&%])+)+)?\\/?|\\S+)¶
-
ALL_WEB_RE
= re.compile('(?:(?:http[s]?|ftp)://|wwww?[.])(?:[a-zA-Z]|[0-9]|[-_:\\/?@.&+=]|(?:%[0-9a-fA-F][0-9a-fA-F]))+|(?:[0-9a-zA-Z][-\\w_]+)(?:\\.[0-9a-zA-Z][-\\w_]+){2,5}(?:(?:\\/(?:[0-9a-zA-Z]|[-_?.#=:&%])+)+)?\\/?|\\S+)¶
-
COMMON_HYPHEN_START
= ['e', 'i', 're', 'ex', 'self', 'fore', 'all', 'low', 'high']¶
-
DIGITS_CAPTURED_RE
= re.compile('((?:\\b|^)[-+±~]?(?:\\d[-.,0-9\\/#]*\\d|\\d+(?:st|nd|rd|th|[dD])?)[%]?(?:\\b|$))')¶
-
DIGITS_RE
= re.compile('(?:\\b|^)[-+±~]?(?:\\d[-.,0-9\\/#]*\\d|\\d+(?:st|nd|rd|th|[dD])?)[%]?(?:\\b|$)')¶
-
DIGIT_RE
= re.compile('\\d')¶
-
DOMAIN_RE
= re.compile('[@]\\S+[.]\\S+')¶
-
EMAIL_RE
= re.compile('\\S+[@]\\S+[.]\\S+')¶
-
HYPHEN_CAPTURED_RE
= re.compile('([\\-\\–\\—])')¶
-
HYPHEN_RE
= re.compile('[\\-\\–\\—]')¶
-
PARA_SEP_RE
= re.compile('(\\W|\\+\\-)\\1{4,}')¶
-
PUNCT_END_PHRASE
= frozenset({'’', ',', '"', ']', '»', '“', '”', ';', '?', ')', '[…]', ':', '!', '.', "'"})¶
-
PUNCT_SEQ_RE
= re.compile("[-!\\'#%&`()\\[\\]*+,.\\\\/:;<=>?@^$_{|}~]+")¶
-
URL_RE
= re.compile('(?:[0-9a-zA-Z][-\\w_]+)(?:\\.[0-9a-zA-Z][-\\w_]+){2,5}(?:(?:\\/(?:[0-9a-zA-Z]|[-_?.#=:&%])+)+)?\\/?|(?:(?:http[s]?|ftp)://|wwww?[.])(?:[a-zA-Z]|[0-9]|[-_:\\/?@.&+=]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')¶
-
WORD_BF_CAPTURED_RE
= re.compile('([()\\[\\]{}"“”\\\'`»:;,/\\\\*?!…<=>@^$\\|~%]|[\\u2022\\u2751\\uF000\\uF0FF]|[\\u25A0-\\u25FF]|\\.{2,})')¶
-
YEAR_RE
= re.compile('(?:\\b|^)(?:19|20)\\d\\d(?:\\b|$)')¶
-
static
abbreviation
(phrase)¶
-
all_web_captured_pn
= '((?:(?:http[s]?|ftp)://|wwww?[.])(?:[a-zA-Z]|[0-9]|[-_:\\/?@.&+=]|(?:%[0-9a-fA-F][0-9a-fA-F]))+|(?:[0-9a-zA-Z][-\\w_]+)(?:\\.[0-9a-zA-Z][-\\w_]+){2,5}(?:(?:\\/(?:[0-9a-zA-Z]|[-_?.#=:&%])+)+)?\\/?|\\S+[@]\\S+[.]\\S+|[@]\\S+[.]\\S+)'¶
-
all_web_pn
= '(?:(?:http[s]?|ftp)://|wwww?[.])(?:[a-zA-Z]|[0-9]|[-_:\\/?@.&+=]|(?:%[0-9a-fA-F][0-9a-fA-F]))+|(?:[0-9a-zA-Z][-\\w_]+)(?:\\.[0-9a-zA-Z][-\\w_]+){2,5}(?:(?:\\/(?:[0-9a-zA-Z]|[-_?.#=:&%])+)+)?\\/?|\\S+[@]\\S+[.]\\S+|[@]\\S+[.]\\S+'¶
-
digits_captured_pn
= '((?:\\b|^)[-+±~]?(?:\\d[-.,0-9\\/#]*\\d|\\d+(?:st|nd|rd|th|[dD])?)[%]?(?:\\b|$))'¶
-
digits_pn
= '(?:\\b|^)[-+±~]?(?:\\d[-.,0-9\\/#]*\\d|\\d+(?:st|nd|rd|th|[dD])?)[%]?(?:\\b|$)'¶
-
domain_pn
= '[@]\\S+[.]\\S+'¶
-
email_pn
= '\\S+[@]\\S+[.]\\S+'¶
-
hyphen_pn
= '[\\-\\–\\—]'¶
-
known_month_pn
= '(?:jan|feb|mar|apr|jun|jul|aug|sep|Sept|sept|SEPT|oct|nov|dec)\\.'¶
-
months
= ['jan', 'feb', 'mar', 'apr', 'jun', 'jul', 'aug', 'sep', 'Sept', 'sept', 'SEPT', 'oct', 'nov', 'dec']¶
-
repeat_abbrev_pn
= '(\\w\\.){2,}'¶
-
si_units
= ['m²', 'fm', 'cm²', 'm³', 'cm³', 'l', 'ltr', 'dl', 'cl', 'ml', '°C', '°F', 'K', 'g', 'gr', 'kg', 't', 'mg', 'μg', 'm', 'km', 'mm', 'μm', 'cm', 'sm', 's', 'ms', 'μs', 'Nm', 'klst', 'min', 'W', 'mW', 'kW', 'MW', 'GW', 'TW', 'J', 'kJ', 'MJ', 'GJ', 'TJ', 'kWh', 'MWh', 'kWst', 'MWst', 'kcal', 'cal', 'N', 'kN', 'V', 'v', 'mV', 'kV', 'A', 'mA', 'Hz', 'kHz', 'MHz', 'GHz', 'Pa', 'hPa', '°', '°c', '°f']¶
-
url_pn
= '(?:[0-9a-zA-Z][-\\w_]+)(?:\\.[0-9a-zA-Z][-\\w_]+){2,5}(?:(?:\\/(?:[0-9a-zA-Z]|[-_?.#=:&%])+)+)?\\/?'¶
-
url_strict_pn
= '(?:(?:http[s]?|ftp)://|wwww?[.])(?:[a-zA-Z]|[0-9]|[-_:\\/?@.&+=]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'¶
-
word_bf_captured_pn
= '([()\\[\\]{}"“”\\\'`»:;,/\\\\*?!…<=>@^$\\|~%]|[\\u2022\\u2751\\uF000\\uF0FF]|[\\u25A0-\\u25FF]|\\.{2,})'¶
-
word_bf_pn
= '[()\\[\\]{}"“”\\\'`»:;,/\\\\*?!…<=>@^$\\|~%]|[\\u2022\\u2751\\uF000\\uF0FF]|[\\u25A0-\\u25FF]|\\.{2,}'¶
-
easy_tokenizer.token_with_pos module¶
Class for tokens with position information
-
class
easy_tokenizer.token_with_pos.
TokenWithPos
(text, start, end)¶ Bases:
object
- TokenWithPos: token with start and end position in the text
- attributes: - text: text in the normalized form - start: start position - end: end position
easy_tokenizer.tokenizer module¶
Tokenizer Class
-
class
easy_tokenizer.tokenizer.
Tokenizer
(regexp=None)¶ Bases:
object
A basic Tokenizer class to tokenize strings and patterns
- Parameters:
- regexp: regexp used to tokenize the string
-
tokenize
(text)¶ - params:
- text: string
- pos_info: also output the position information when tokenizing
output: tokens (with position info)
-
tokenize_with_pos_info
(text)¶ tokenize
- params:
- text: string
- output:
- a list of Token object