# Source code for compass.extraction.ngrams

"""Ordinance ngram text validation

This check helps validate that the LLM extracted text from the original
document and did not make it up itself.
"""

import nltk
from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize
from nltk.corpus import stopwords
from nltk.util import ngrams


# Fetch the NLTK resources (quietly) before the stop word list is read
# below. These calls run when the module is imported.
for _resource in ("punkt", "punkt_tab", "stopwords"):
    nltk.download(_resource, quiet=True)
del _resource  # do not leave the loop variable in the module namespace

# English stop words, stored as a set for fast membership tests
STOP_WORDS = {*stopwords.words("english")}

# Tokens treated as punctuation and dropped before ngrams are built.
# NOTE(review): "''" and "``" are presumably how the tokenizer renders
# double quotes - confirm against the NLTK tokenizer output.
PUNCTUATIONS = {
    '"',
    ".",
    "(",
    ")",
    ",",
    "?",
    ";",
    ":",
    "''",
    "``",
}


def _check_word(word):
    """Check whether a word should be kept for ngram construction

    Parameters
    ----------
    word : str
        Single token to test. The token is compared as-is against
        ``STOP_WORDS`` and ``PUNCTUATIONS`` (no case conversion is
        performed here).

    Returns
    -------
    bool
        ``False`` if the word is a stop word or a punctuation token,
        ``True`` otherwise.
    """
    is_excluded = word in STOP_WORDS or word in PUNCTUATIONS
    return not is_excluded


def _filtered_words(sentence):
    """Tokenize a sentence and drop stop words and punctuations

    Parameters
    ----------
    sentence : str
        Text to split into word tokens.

    Returns
    -------
    list
        Case-folded tokens from the sentence, in their original order,
        excluding every token rejected by ``_check_word``.
    """
    kept = []
    for token in word_tokenize(sentence):
        # Case-fold first so that the filter check is case-insensitive
        folded = token.casefold()
        if _check_word(folded):
            kept.append(folded)
    return kept


def convert_text_to_sentence_ngrams(text, n):
    """Convert input text to a list of ngrams

    The text is first split by sentence, after which each sentence is
    converted into ngrams. The ngrams for all sentences are combined
    and returned. Because ngrams are built one sentence at a time, no
    ngram spans a sentence boundary.

    Parameters
    ----------
    text : str
        Input text containing one or more sentences.
    n : int
        Number of words to include per ngram.

    Returns
    -------
    list
        List of tuples, where each tuple is an ngram from the original
        text.
    """
    all_ngrams = []
    for sentence in sent_tokenize(text):
        # Stop words and punctuations are removed (and the remaining
        # words case-folded) before the ngrams are formed
        words = _filtered_words(sentence)
        all_ngrams.extend(ngrams(words, n))
    return all_ngrams
def sentence_ngram_containment(original, test, n):
    """Fraction of sentence ngrams from the text found in the original

    Parameters
    ----------
    original : str
        Original (superset) text. Ngrams from the `test` text will be
        checked against this text.
    test : str
        Test (sub) text. Ngrams from this text will be searched for in
        the original text, and the fraction of these ngrams that are
        found in the original text will be returned.
    n : int
        Number of words to include per ngram.

    Returns
    -------
    float
        Fraction of ngrams from the `test` input that were found in the
        `original` text. Always returns ``True`` if test has no ngrams.
    """
    ngrams_test = convert_text_to_sentence_ngrams(test, n)
    num_test_ngrams = len(ngrams_test)
    if not num_test_ngrams:
        # Nothing to verify; this also avoids a division by zero below
        return True

    # Set gives fast membership checks. Repeated ngrams in `test` are
    # each counted, since `ngrams_test` is kept as a list.
    ngrams_original = set(convert_text_to_sentence_ngrams(original, n))
    num_ngrams_found = sum(t in ngrams_original for t in ngrams_test)
    return num_ngrams_found / num_test_ngrams