Class 11: Code Profiling

In this class we will use code profiling to investigate problems with parallelization of NLP pipelines.

spaCy NLP Pipeline

Run the following snippet, comparing n_process=1 versus n_process=4.

import spacy
from spacy.tokens import Doc
from nltk.tokenize import TweetTokenizer

class MyTokenizer:

    def __init__(self, vocab):
        self.vocab = vocab
        self.rgx = TweetTokenizer().WORD_RE

    def __call__(self, text):
        words = self.rgx.findall(text)
        return Doc(self.vocab, words=words)

nlp = spacy.blank("en")
nlp.tokenizer = MyTokenizer(nlp.vocab)
doc = nlp("What's happened to me? he thought. It wasn't a dream.")
print([token.text for token in doc])

texts = ["What's happened to me? he thought. It wasn't a dream."] * 100000

# Try n_process=1, then n_process=4, and compare how long the call takes.
docs = nlp.pipe(texts, n_process=1, batch_size=10000)
docs = list(docs)
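To make the comparison concrete, you can wrap the pipe call in a simple timer. This is a minimal sketch, assuming the nlp object and texts list defined above; the time_pipe helper is just an illustration, not part of spaCy:

import time

def time_pipe(n_process, batch_size=10000):
    # Run the full pipeline and return the elapsed wall-clock time in seconds.
    start = time.perf_counter()
    list(nlp.pipe(texts, n_process=n_process, batch_size=batch_size))
    return time.perf_counter() - start

for n in (1, 4):
    print(f"n_process={n}: {time_pipe(n):.2f}s")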

Profiling tokenization

Use the profiler to debug and understand the issue!

import cProfile, pstats, io
from pstats import SortKey
import spacy
from spacy.tokens import Doc
from nltk.tokenize import TweetTokenizer

class MyTokenizer:

    def __init__(self, vocab):
        self.vocab = vocab
        self.rgx = TweetTokenizer().WORD_RE

    def __call__(self, text):
        words = self.rgx.findall(text)
        return Doc(self.vocab, words=words)

nlp = spacy.blank("en")
nlp.tokenizer = MyTokenizer(nlp.vocab)
texts = ["What's happened to me? he thought. It wasn't a dream."]*100000

def test_fn():
    docs = nlp.pipe(texts, n_process=2, batch_size=1000)
    docs = list(docs)


with cProfile.Profile() as pr:
    test_fn()  # run the code we want to profile

# Collect the statistics, sort by cumulative time, and print the report.
s = io.StringIO()
sortby = SortKey.CUMULATIVE
stats = pstats.Stats(pr, stream=s).sort_stats(sortby)
stats.print_stats()
print(s.getvalue())
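The full report can be very long. pstats also lets you limit or filter what gets printed, and cProfile can dump the raw data for later inspection; a small sketch, assuming the pr object from above (the filename is just an example):

# Show only the 20 most expensive entries, sorted by cumulative time.
pstats.Stats(pr).sort_stats(SortKey.CUMULATIVE).print_stats(20)

# Restrict the report to entries whose file path matches a regex, e.g. spaCy code.
pstats.Stats(pr).print_stats("spacy")

# Save the raw profile data to a file for later inspection.
pr.dump_stats("pipe_profile.prof")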

Note

The code is also available as a gist.