Finding the most frequent tag

Now, just to compare the performance of our HMM model, let's build a most frequent class tagger (MFC Tagger). We start by defining a function to count the pairs of tags and words:

def pair_counts(tags, words):
    """Build a two-level frequency table from two parallel sequences.

    The returned mapping is indexed first by an element of `tags` and then
    by the paired element of `words`; each leaf holds the number of times
    that pair occurs.  Note the caller below invokes this as
    pair_counts(words, tags), so there the outer key is the word and the
    inner key is the tag.
    """
    table = defaultdict(lambda: defaultdict(int))
    for outer_key, inner_key in zip(tags, words):
        table[outer_key][inner_key] += 1
    return table
# Flatten the training stream into parallel tag and word sequences.
# Fix: the original wrapped the stream in enumerate() but never used the
# index, so the enumerate (and the unused `i`) is dropped.
tags = [tag for word, tag in data.training_set.stream()]
words = [word for word, tag in data.training_set.stream()]

Now, let's define the MFCTagger class:

# Lightweight stand-in for a pomegranate state object: exposes only `.name`.
FakeState = namedtuple('FakeState', 'name')


class MFCTagger:
    """Most-frequent-class baseline tagger with a pomegranate-style API."""

    # Sentinel state returned for any word never seen during training.
    missing = FakeState(name='<MISSING>')

    def __init__(self, table):
        """`table` maps each word (str) to its most frequent tag (str)."""
        self.table = defaultdict(lambda: MFCTagger.missing)
        for word, tag in table.items():
            self.table[word] = FakeState(name=tag)

    def viterbi(self, seq):
        """Mimic pomegranate's viterbi() interface.

        Returns (log-probability, enumerated state path) where the path is
        bracketed by "<start>" and "<end>" sentinels.
        """
        path = ["<start>"]
        path.extend(self.table[word] for word in seq)
        path.append("<end>")
        return 0., list(enumerate(path))

# NOTE(review): these recompute exactly the same lists built earlier in the
# file — kept for notebook-cell parity, but the unused enumerate() index is
# dropped (it was never referenced).
tags = [tag for word, tag in data.training_set.stream()]
words = [word for word, tag in data.training_set.stream()]

# Count tag frequencies per word. The arguments are intentionally swapped
# relative to pair_counts' parameter names so the OUTER key is the word.
word_counts = pair_counts(words, tags)

# Fix: the original genexp loop variable was named `tags`, shadowing the
# module-level `tags` list inside the comprehension — renamed to
# `tag_counts`. max(tag_counts, key=tag_counts.get) picks the modal tag
# (same result as max(tags.keys(), key=lambda key: tags[key])).
mfc_table = {word: max(tag_counts, key=tag_counts.get)
             for word, tag_counts in word_counts.items()}

mfc_model = MFCTagger(mfc_table)

Here are some helper functions to make predictions from the model:

def replace_unknown(sequence):
    """Map out-of-vocabulary words in `sequence` to the placeholder 'nan'."""
    vocab = data.training_set.vocab  # hoist the attribute lookup out of the loop
    return [word if word in vocab else 'nan' for word in sequence]

def simplify_decoding(X, model):
    """Decode sentence X with `model` and return the predicted tag names.

    Unknown words are replaced before decoding; the "<start>"/"<end>"
    sentinel entries at the ends of the state path are dropped.
    """
    _, state_path = model.viterbi(replace_unknown(X))
    inner = state_path[1:-1]
    return [state.name for _, state in inner]
>>> for key in data.testing_set.keys[:2]:
... print("Sentence Key: {}\n".format(key))
... print("Predicted labels:\n-----------------")
... print(simplify_decoding(data.sentences[key].words, mfc_model))
... print()
... print("Actual labels:\n--------------")
... print(data.sentences[key].tags)
... print("\n")