Source code for intent.utils.token

"""
Created on Mar 21, 2014

@author: rgeorgi
"""
from intent.consts import word_re, punc_re
from intent.utils.string_utils import string_compare_with_processing
import re

# ===============================================================================
# Main Token Class
# ===============================================================================

class Token(object):

    def __init__(self, content, **kwargs):
        self.content = content
        self.start = kwargs.get('start')
        self.stop = kwargs.get('stop')
        self.index = kwargs.get('index')
        self.attributes = {}
        self._parent = kwargs.get('parent')

    def __str__(self):
        return self.content

    def __repr__(self):
        return '<%s %s>' % (self.__class__.__name__, self.content)

    @property
    def parent(self):
        return self._parent

    def lower(self):
        return str(self).lower()

    @parent.setter
    def parent(self, v):
        self._parent = v

    @property
    def attrs(self):
        return self.attributes

    @property
    def seq(self):
        return self.content

    def value(self):
        return self.content

    def __eq__(self, o):
        return str(self) == str(o)
    def morphs(self, **kwargs):
        for morph in self.morphed_tokens():
            if kwargs.get('lowercase'):
                # Rebuild the morph with a lowercased surface form; pass the
                # span and parent as keywords (Morph has no `span` attribute).
                morph = Morph(morph.seq.lower(), start=morph.start,
                              stop=morph.stop, parent=morph.parent)
            yield morph
    def morphed_tokens(self):
        morphs = list(tokenize_string(self.seq, morpheme_tokenizer))

        # If the tokenization yields no tokens, just return the string.
        if self.seq and len(morphs) == 0:
            yield Morph(self.seq, parent=self)

        for morph in morphs:
            yield Morph.fromToken(morph, parent=self)

    def morphequals(self, o, **kwargs):
        return string_compare_with_processing(self.seq, o.seq, **kwargs)
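# Illustrative usage sketch (editor's addition, not part of the original
# module): morphs() re-tokenizes a token's surface form on morpheme
# boundaries, so a hyphenated word yields one Morph per segment.
def _demo_token_morphs():
    t = Token('Walk-ed', index=1)
    return [m.seq for m in t.morphs(lowercase=True)]  # -> ['walk', 'ed']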
#===============================================================================
# POSToken
#===============================================================================
class POSToken(Token):
    def __init__(self, content, **kwargs):
        super().__init__(content, **kwargs)
        if 'label' in kwargs:
            self.label = kwargs.get('label')

    def __repr__(self):
        return str(self)

    def __str__(self):
        return '<%s %s [%s]>' % (self.__class__.__name__, self.content, self.label)

    @property
    def label(self):
        return self.attributes.get('label')

    @label.setter
    def label(self, v):
        if v:
            self.attributes['label'] = v

    @classmethod
    def fromToken(cls, t, **kwargs):
        return cls(t.seq, **kwargs)
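# Illustrative usage sketch (editor's addition, not part of the original
# module): a POSToken stores its label in the shared attributes dict, and
# fromToken() copies another token's surface form.
def _demo_postoken():
    pt = POSToken.fromToken(Token('dog'), label='NN')
    return (pt.seq, pt.label)  # -> ('dog', 'NN')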
class GoldTagPOSToken(Token):
    def __init__(self, content, **kwargs):
        Token.__init__(self, content, **kwargs)
        self.taglabel = kwargs.get('taglabel')
        self.goldlabel = kwargs.get('goldlabel')

    @classmethod
    def fromToken(cls, t, taglabel=None, goldlabel=None):
        # Token defines no get_content(); use the content attribute directly.
        return cls(t.content, taglabel=taglabel, goldlabel=goldlabel,
                   index=t.index, start=t.start, stop=t.stop, parent=t.parent)

    @property
    def taglabel(self):
        return self.attributes.get('taglabel')

    @taglabel.setter
    def taglabel(self, v):
        self.attributes['taglabel'] = v

    @property
    def goldlabel(self):
        return self.attributes.get('goldlabel')

    @goldlabel.setter
    def goldlabel(self, v):
        self.attributes['goldlabel'] = v
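# Illustrative usage sketch (editor's addition, not part of the original
# module): GoldTagPOSToken carries both a predicted tag and a gold tag,
# again backed by the attributes dict.
def _demo_goldtag():
    g = GoldTagPOSToken.fromToken(Token('perro', index=1),
                                  taglabel='NN', goldlabel='NOUN')
    return (g.content, g.taglabel, g.goldlabel)  # -> ('perro', 'NN', 'NOUN')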
#===============================================================================
# Morph
#===============================================================================
class Morph(Token):
    """
    This class is what makes up an IGTToken. Should be comparable
    to a token.
    """
    def __init__(self, seq='', start=None, stop=None, parent=None):
        index = parent.index if parent else None
        # The 'tier' keyword is absorbed by **kwargs in Token.__init__.
        Token.__init__(self, content=seq, start=start, stop=stop, index=index,
                       parent=parent, tier=parent)

    @classmethod
    def fromToken(cls, token, parent):
        return cls(token.seq, start=token.start, stop=token.stop, parent=parent)

    def __str__(self):
        return '<Morph: %s>' % self.seq
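# Illustrative usage sketch (editor's addition, not part of the original
# module): a Morph inherits its index from its parent token.
def _demo_morph_parent():
    t = Token('gato-s', index=3)
    m = Morph('gato', start=0, stop=4, parent=t)
    return (m.index, str(m))  # -> (3, '<Morph: gato>')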
#===============================================================================
# Tokenization Methods
#===============================================================================
def whitespace_tokenizer(st):
    i = 1
    for match in re.finditer(r'\S+', st, re.UNICODE):
        yield Token(match.group(0), start=match.start(), stop=match.end(), index=i)
        i += 1
def sentence_tokenizer(st):
    i = 1
    for match in re.finditer('{}|{}+'.format(word_re, punc_re), st, flags=re.U):
        yield Token(match.group(0), start=match.start(), stop=match.end(), index=i)
        i += 1
def morpheme_tokenizer(st):
    """
    Tokenize a string, splitting on typical morpheme boundaries:
    [ - . : / ( ) = ] and whitespace.

    :param st: the string to tokenize
    """
    pieces = re.finditer(r'[^\s\-\.:/\(\)=]+', st)

    for match in pieces:
        if match.group().strip():
            yield Morph(match.group(0), start=match.start(), stop=match.end())
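# Illustrative usage sketch (editor's addition, not part of the original
# module): segments between the boundary characters become Morphs with
# character offsets.
def _demo_morpheme_tokenizer():
    return [(m.seq, m.start, m.stop) for m in morpheme_tokenizer('walk-ed.PST')]
    # -> [('walk', 0, 4), ('ed', 5, 7), ('PST', 8, 11)]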
def tag_tokenizer(st, delimiter='/'):
    for match in re.finditer(r'(\S+){}(\S+)'.format(delimiter), st, re.UNICODE):
        yield POSToken(match.group(1), label=match.group(2),
                       start=match.start(), stop=match.end())
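# Illustrative usage sketch (editor's addition, not part of the original
# module): slash-delimited word/tag pairs become POSTokens.
def _demo_tag_tokenizer():
    return [(t.seq, t.label) for t in tag_tokenizer('the/DT dog/NN')]
    # -> [('the', 'DT'), ('dog', 'NN')]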
def tokenize_item(it, tokenizer=whitespace_tokenizer):
    """
    :rtype: Tokenization
    """
    tokens = tokenize_string(it.value(), tokenizer)
    return tokens
def tokenize_string(st, tokenizer=whitespace_tokenizer):
    """
    :rtype: Tokenization
    """
    tokens = Tokenization()

    # Renumber tokens from 1, overriding any index the tokenizer assigned.
    for i, token in enumerate(tokenizer(st), start=1):
        token.index = i
        tokens.append(token)

    return tokens
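# Illustrative usage sketch (editor's addition, not part of the original
# module): tokenize_string() renumbers tokens from 1 and collects them in
# a Tokenization.
def _demo_tokenize_string():
    tokens = tokenize_string('el gato negro')
    return [(t.index, t.seq) for t in tokens]
    # -> [(1, 'el'), (2, 'gato'), (3, 'negro')]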
#===============================================================================
# Tokenization helper classes
#===============================================================================
class Tokenization(list):
    """
    Container class for a tokenization.
    """
    def __init__(self, seq=(), original=''):
        self.original = original
        list.__init__(self, seq)
    def text(self):
        return ' '.join([t.seq for t in self])

    def __getitem__(self, item):
        """
        :rtype: Token
        """
        return super().__getitem__(item)

    def slashtags(self, delimiter='/'):
        return ' '.join(['{}{}{}'.format(x.seq, delimiter, x.label) for x in self])
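# Illustrative usage sketch (editor's addition, not part of the original
# module): text() rebuilds the surface string, while slashtags() serializes
# word/tag pairs.
def _demo_tokenization():
    tokens = tokenize_string('the/DT dog/NN', tokenizer=tag_tokenizer)
    return (tokens.text(), tokens.slashtags())
    # -> ('the dog', 'the/DT dog/NN')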
class Span(object):
    """
    A simple character span: a (start, stop) pair.
    """
    def __init__(self, tup):
        self._start = tup[0]
        self._stop = tup[1]

    @property
    def start(self):
        return self._start

    @property
    def stop(self):
        return self._stop

    def __str__(self):
        return '(%s,%s)' % (self._start, self._stop)

    def __repr__(self):
        return str(self)
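# Illustrative usage sketch (editor's addition, not part of the original
# module): Span wraps a (start, stop) tuple with read-only accessors.
def _demo_span():
    s = Span((0, 4))
    return (s.start, s.stop, str(s))  # -> (0, 4, '(0,4)')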
#===============================================================================
# Exceptions
#===============================================================================
class TokenException(Exception):
    pass