Source code for compass.extraction.ngrams

"""Ordinance ngram text validation

This check helps validate that the LLM extracted text from the original
document and did not make it up itself.
"""

import nltk
from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize
from nltk.corpus import stopwords
from nltk.util import ngrams


# Import-time setup: request the NLTK resources named below before any
# of them is read. ``quiet=True`` is passed to every download call.
# NOTE(review): "punkt"/"punkt_tab" are presumably the data behind the
# `word_tokenize`/`sent_tokenize` functions imported above -- confirm.
nltk.download("punkt", quiet=True)
nltk.download("punkt_tab", quiet=True)
nltk.download("stopwords", quiet=True)
# English stop words, held in a set for the membership test in
# `_check_word`. Must run after the "stopwords" download above.
STOP_WORDS = set(stopwords.words("english"))
# Punctuation tokens excluded from ngrams by `_check_word`.
# NOTE(review): "''" and "``" look like the quote tokens produced by the
# NLTK word tokenizer for double quotes -- confirm.
PUNCTUATIONS = {'"', ".", "(", ")", ",", "?", ";", ":", "''", "``"}


def _check_word(word):
    """Tell whether a token should be kept for ngram construction

    A token is kept only when it is neither listed in ``STOP_WORDS``
    nor in ``PUNCTUATIONS``. The comparison is exact, so any case
    normalization has to be done by the caller.
    """
    is_noise = word in STOP_WORDS or word in PUNCTUATIONS
    return not is_noise


def _filtered_words(sentence):
    """Tokenize a sentence and keep only its meaningful words

    The sentence is split with ``word_tokenize``, every token is
    case-folded, and tokens rejected by ``_check_word`` (stop words and
    punctuation) are discarded. Returns the surviving case-folded
    tokens as a list, in their original order.
    """
    folded_tokens = (token.casefold() for token in word_tokenize(sentence))
    return [token for token in folded_tokens if _check_word(token)]



[docs]
def convert_text_to_sentence_ngrams(text, n):
    """Build the ngrams of every sentence found in the input text

    Ngrams never span a sentence boundary: the text is split into
    sentences first, each sentence is reduced to its filtered words
    (stop words and punctuation removed, case-folded), and the ngrams
    are generated per sentence. The per-sentence results are
    concatenated in sentence order.

    Parameters
    ----------
    text : str
        Input text containing one or more sentences.
    n : int
        Number of words to include per ngram.

    Returns
    -------
    list
        List of tuples, one tuple per ngram, covering all sentences of
        the input text. Duplicates are kept.
    """
    return [
        gram
        for sentence in sent_tokenize(text)
        for gram in ngrams(_filtered_words(sentence), n)
    ]




[docs]
def sentence_ngram_containment(original, test, n):
    """Measure how much of the test text's ngrams occur in the original

    Both texts are converted to sentence-level ngrams. Every ngram of
    the `test` text (duplicates included) is then looked up in the set
    of ngrams of the `original` text.

    Parameters
    ----------
    original : str
        Original (superset) text that the `test` ngrams are checked
        against.
    test : str
        Test (sub) text whose ngrams are searched for in the original
        text.
    n : int
        Number of words to include per ngram.

    Returns
    -------
    float or bool
        Number of `test` ngrams found in `original` divided by the
        total number of `test` ngrams. If the `test` text yields no
        ngrams at all, the boolean ``True`` is returned instead and the
        original text is not processed.
    """
    candidate_ngrams = convert_text_to_sentence_ngrams(test, n)
    total = len(candidate_ngrams)
    if total == 0:
        # Nothing to verify, so the check is treated as passed.
        return True

    # A set gives constant-time lookups for the containment test.
    reference_ngrams = set(convert_text_to_sentence_ngrams(original, n))
    hits = 0
    for gram in candidate_ngrams:
        if gram in reference_ngrams:
            hits += 1
    return hits / total