Source code for compass.extraction.apply

"""Ordinance function to apply ordinance extraction on a document"""

import logging
from warnings import warn

from compass.llm import StructuredLLMCaller
from compass.extraction.date import DateExtractor
from compass.validation import (
    ParseChunksWithMemory,
    LegalTextValidator,
    parse_by_chunks,
)
from compass.warn import COMPASSWarning


logger = logging.getLogger(__name__)
# Maximum allowed ratio of LLM output length to input length: extracted
# text longer than this multiple of the input text is thrown away as a
# possible hallucination
_TEXT_OUT_CHAR_BUFFER = 1.05


async def check_for_ordinance_info(
    doc,
    model_config,
    heuristic,
    ordinance_text_collector_class,
    permitted_use_text_collector_class=None,
    usage_tracker=None,
):
    """Parse a single document for ordinance information

    Parameters
    ----------
    doc : elm.web.document.BaseDocument
        A document potentially containing ordinance information. Note
        that if the document's attrs contains the
        ``"contains_ord_info"`` key, it will not be processed. To force
        a document to be processed by this function, remove that key
        from the documents attrs.
    model_config : obj
        Configuration object exposing a ``text_splitter`` (which must
        implement a `split_text` method taking a str and returning a
        list of text chunks), an ``llm_service``, and an
        ``llm_call_kwargs`` mapping that is unpacked into the validator
        and collector constructors.
    heuristic : obj
        Heuristic object handed through unchanged to
        :func:`~compass.validation.parse_by_chunks`.
    ordinance_text_collector_class : callable
        Class (or factory) invoked with ``llm_service``,
        ``usage_tracker`` and the LLM call kwargs. The resulting
        instance must provide a ``check_chunk`` callback as well as
        ``contains_ord_info`` and ``ordinance_text`` attributes.
    permitted_use_text_collector_class : callable, optional
        Class (or factory) invoked the same way as
        `ordinance_text_collector_class`. The resulting instance must
        provide a ``check_chunk`` callback as well as
        ``contains_district_info`` and ``permitted_use_district_text``
        attributes. If ``None``, no permitted use text is collected.
        By default, ``None``.
    usage_tracker : compass.services.usage.UsageTracker, optional
        Optional tracker instance to monitor token usage during
        LLM calls. By default, ``None``.

    Returns
    -------
    elm.web.document.BaseDocument
        Document that has been parsed for ordinance text. The results
        of the parsing are stored in the documents attrs. In
        particular, the attrs will contain a ``"contains_ord_info"``
        key taken from the ordinance text collector. If that value is
        truthy, the attrs will also contain an ``"ordinance_text"`` key
        holding the collected ordinance text. When a permitted use
        collector class is given, the attrs will additionally contain
        a ``"contains_district_info"`` key and, if that value is
        truthy, a ``"permitted_use_text"`` key.
    """
    if "contains_ord_info" in doc.attrs:
        # Document was already processed; nothing to do
        return doc

    def _build_collector(collector_class):
        """Instantiate a text collector with the shared LLM config"""
        return collector_class(
            llm_service=model_config.llm_service,
            usage_tracker=usage_tracker,
            **model_config.llm_call_kwargs,
        )

    text_chunks = model_config.text_splitter.split_text(doc.text)
    parser_with_memory = ParseChunksWithMemory(text_chunks, num_to_recall=2)
    validator = LegalTextValidator(
        llm_service=model_config.llm_service,
        usage_tracker=usage_tracker,
        doc_is_from_ocr=doc.attrs.get("from_ocr", False),
        **model_config.llm_call_kwargs,
    )

    ord_collector = _build_collector(ordinance_text_collector_class)
    chunk_callbacks = [ord_collector.check_chunk]

    use_collector = None
    if permitted_use_text_collector_class is not None:
        use_collector = _build_collector(permitted_use_text_collector_class)
        chunk_callbacks.append(use_collector.check_chunk)

    await parse_by_chunks(
        parser_with_memory,
        heuristic,
        validator,
        callbacks=chunk_callbacks,
        min_chunks_to_process=3,
    )

    doc.attrs["contains_ord_info"] = ord_collector.contains_ord_info
    if doc.attrs["contains_ord_info"]:
        doc.attrs["ordinance_text"] = ord_collector.ordinance_text
        logger.debug_to_file(
            "Ordinance text for %s is:\n%s",
            doc.attrs.get("source", "unknown source"),
            doc.attrs["ordinance_text"],
        )

    if use_collector is None:
        return doc

    doc.attrs["contains_district_info"] = use_collector.contains_district_info
    if doc.attrs["contains_district_info"]:
        doc.attrs["permitted_use_text"] = (
            use_collector.permitted_use_district_text
        )
        logger.debug_to_file(
            "Permitted use text for %s is:\n%s",
            doc.attrs.get("source", "unknown source"),
            doc.attrs["permitted_use_text"],
        )

    return doc
async def extract_date(doc, model_config, usage_tracker=None):
    """Parse a single document for date information

    Parameters
    ----------
    doc : elm.web.document.BaseDocument
        A document potentially containing date information.
    model_config : obj
        Configuration object exposing an ``llm_service``, an
        ``llm_call_kwargs`` mapping (unpacked into the LLM caller
        constructor) and a ``text_splitter`` (handed to the date
        extractor).
    usage_tracker : compass.services.usage.UsageTracker, optional
        Optional tracker instance to monitor token usage during
        LLM calls. By default, ``None``.

    Returns
    -------
    elm.web.document.BaseDocument
        Document that has been parsed for dates. The results of
        the parsing are stored in the documents attrs. In particular,
        the attrs will contain a ``"date"`` key that will contain the
        parsed date information.
    """
    llm_caller = StructuredLLMCaller(
        llm_service=model_config.llm_service,
        usage_tracker=usage_tracker,
        **model_config.llm_call_kwargs,
    )
    date_extractor = DateExtractor(llm_caller, model_config.text_splitter)
    parsed_date = await date_extractor.parse(doc)
    doc.attrs["date"] = parsed_date
    return doc
async def extract_ordinance_text_with_llm(
    doc, text_splitter, extractor, original_text_key
):
    """Extract ordinance text from document using LLM

    The parsers exposed by `extractor` are applied as a chain: the
    first one consumes the text stored under `original_text_key`, and
    every following one consumes the output of the parser before it.

    Parameters
    ----------
    doc : elm.web.document.BaseDocument
        A document known to contain ordinance information. This means
        it must contain the `original_text_key` key (e.g.
        ``"ordinance_text"``) in the attrs. You can run
        :func:`~compass.extraction.apply.check_for_ordinance_info`
        to have this attribute populated automatically for documents
        that are found to contain ordinance data. Note that if the
        document's attrs does not contain that key, you will get an
        error.
    text_splitter : obj
        Instance of an object that implements a `split_text` method.
        The method should take text as input (str) and return a list
        of text chunks. Langchain's text splitters should work for this
        input.
    extractor : compass.extraction.ordinance.WindOrdinanceTextExtractor
        Instance of
        :class:`~compass.extraction.ordinance.WindOrdinanceTextExtractor`
        used for ordinance text extraction. Its ``parsers`` attribute
        is iterated as ``(output_key, parser)`` pairs.
    original_text_key : str
        String corresponding to the `doc.attrs` key containing the
        original text (before extraction).

    Returns
    -------
    elm.web.document.BaseDocument
        Document that has been parsed for ordinance text. The results
        of the extraction are stored in the document's attrs.
    str
        Key corresponding to the cleaned ordinance text stored in the
        `doc.attrs` dictionary. This is `original_text_key` itself if
        the extractor provides no parsers.
    """
    source_key = original_text_key
    for target_key, parser in extractor.parsers:
        input_text = doc.attrs[source_key]
        output_text = await _parse_if_input_text_not_empty(
            input_text, text_splitter, parser, source_key, target_key
        )
        doc.attrs[target_key] = output_text
        # Output of this step becomes the input of the next one
        source_key = target_key

    return doc, source_key
async def extract_ordinance_text_with_ngram_validation(
    doc,
    text_splitter,
    extractor,
    original_text_key,
    n=4,
    num_extraction_attempts=3,
    ngram_fraction_threshold=0.9,
    ngram_ocr_fraction_threshold=0.75,
):
    """Extract ordinance text for a single document with known ord info

    This extraction includes an "ngram" check, which attempts to detect
    whether or not the cleaned text was extracted from the original
    ordinance text. The processing will attempt to re-extract the text
    if the validation does not pass a certain threshold until the
    maximum number of attempts is reached. If the text still does not
    pass validation at this point, there is a good chance that the LLM
    hallucinated parts of the output text, so caution should be taken.

    Parameters
    ----------
    doc : elm.web.document.BaseDocument
        A document known to contain ordinance information. This means
        it must contain the `original_text_key` key (e.g.
        ``"ordinance_text"``) in the attrs. You can run
        :func:`~compass.extraction.apply.check_for_ordinance_info`
        to have this attribute populated automatically for documents
        that are found to contain ordinance data. Note that if the
        document's attrs does not contain that key (or the value is
        empty), a warning is issued and the document is not processed.
    text_splitter : obj
        Instance of an object that implements a `split_text` method.
        The method should take text as input (str) and return a list
        of text chunks. Langchain's text splitters should work for this
        input.
    extractor : obj
        Extractor instance whose parsers are used for the ordinance
        text extraction (see
        :func:`~compass.extraction.apply.extract_ordinance_text_with_llm`).
    original_text_key : str
        String corresponding to the `doc.attrs` key containing the
        original text (before extraction).
    n : int, optional
        Number of words to include per ngram for the ngram validation,
        which helps ensure that the LLM did not hallucinate. Values
        below 1 are raised to 1. By default, ``4``.
    num_extraction_attempts : int, optional
        Number of extraction attempts before giving up on text that
        did not pass the ngram check. If the processing exceeds this
        value, there is a good chance that the LLM hallucinated parts
        of the output text. Cannot be negative or 0 (values below 1
        are raised to 1). By default, ``3``.
    ngram_fraction_threshold : float, optional
        Fraction of ngrams in the cleaned text that are also found in
        the original ordinance text for the extraction to be considered
        successful when the document is not flagged as coming from OCR.
        Should be a value between 0 and 1 (inclusive); values outside
        of that range are clamped. By default, ``0.9``.
    ngram_ocr_fraction_threshold : float, optional
        Fraction of ngrams in the cleaned text that are also found in
        the original ordinance text for the extraction to be considered
        successful when the document is flagged as coming from OCR.
        Should be a value between 0 and 1 (inclusive); values outside
        of that range are clamped. By default, ``0.75``.

    Returns
    -------
    elm.web.document.BaseDocument
        Document that has been parsed for ordinance text. The results
        of the extraction are stored in the document's attrs.
    """
    if not doc.attrs.get(original_text_key):
        msg = (
            f"Input document has no {original_text_key!r} key or string "
            "does not contain information. Please run "
            "`check_for_ordinance_info` prior to calling this method."
        )
        warn(msg, COMPASSWarning)
        return doc

    # Sanitize user inputs so that the retry loop always runs at least
    # once and the thresholds are valid fractions
    ngram_size = max(1, n)
    max_attempts = max(1, num_extraction_attempts)
    text_threshold = max(0, min(1, ngram_fraction_threshold))
    ocr_threshold = max(0, min(1, ngram_ocr_fraction_threshold))

    return await _extract_with_ngram_check(
        doc,
        text_splitter,
        extractor,
        original_text_key,
        n=ngram_size,
        num_tries=max_attempts,
        ngram_fraction_threshold=text_threshold,
        ngram_ocr_fraction_threshold=ocr_threshold,
    )
async def _extract_with_ngram_check(
    doc,
    text_splitter,
    extractor,
    original_text_key,
    n=4,
    num_tries=3,
    ngram_fraction_threshold=0.9,
    ngram_ocr_fraction_threshold=0.75,
):
    """Extract ordinance info from doc and validate using ngrams

    The extraction is repeated up to `num_tries` times. After each
    attempt, the cleaned text is scored against the original text
    using :func:`sentence_ngram_containment`. The first attempt that
    reaches the applicable threshold ends the loop.

    Parameters
    ----------
    doc : elm.web.document.BaseDocument
        Document whose attrs hold the original text under
        `original_text_key`. The ``"source"`` and ``"from_ocr"`` attrs
        are read (with defaults) for logging and threshold selection.
    text_splitter : obj
        Object implementing a `split_text` method; passed through to
        :func:`extract_ordinance_text_with_llm`.
    extractor : obj
        Extractor passed through to
        :func:`extract_ordinance_text_with_llm`.
    original_text_key : str
        Key in `doc.attrs` containing the original (pre-extraction)
        text.
    n : int, optional
        Number of words per ngram. By default, ``4``.
    num_tries : int, optional
        Maximum number of extraction attempts. By default, ``3``.
    ngram_fraction_threshold : float, optional
        Minimum ngram score required when the document is not flagged
        as coming from OCR. By default, ``0.9``.
    ngram_ocr_fraction_threshold : float, optional
        Minimum ngram score required when the document is flagged as
        coming from OCR. By default, ``0.75``.

    Returns
    -------
    elm.web.document.BaseDocument
        The input document. If an attempt passed the ngram check, the
        attrs contain a ``"<original_text_key>_ngram_score"`` key with
        the passing score. If the original text is missing or all
        attempts fail, a warning is issued and that key is not set.
    """
    # Imported at call time rather than at module level
    from compass.extraction.ngrams import sentence_ngram_containment  # noqa

    source = doc.attrs.get("source", "Unknown")
    doc_is_from_ocr = doc.attrs.get("from_ocr", False)
    # `.get` so that a missing key reaches the warning below instead of
    # raising a `KeyError`
    original_text = doc.attrs.get(original_text_key)

    if not original_text:
        msg = (
            "Document missing original ordinance text! No extraction "
            f"performed (Document source: {source})"
        )
        warn(msg, COMPASSWarning)
        return doc

    ngram_thresh = (
        ngram_ocr_fraction_threshold
        if doc_is_from_ocr
        else ngram_fraction_threshold
    )

    best_score = 0
    # `attempt` is 1-based, so it can be reported in logs as-is
    for attempt in range(1, num_tries + 1):
        doc, out_text_key = await extract_ordinance_text_with_llm(
            doc, text_splitter, extractor, original_text_key
        )
        cleaned_text = doc.attrs[out_text_key]
        if not cleaned_text:
            logger.debug(
                "No cleaned text found after extraction on attempt %d "
                "for document with source %s. Retrying...",
                attempt,
                source,
            )
            continue

        ngram_frac = sentence_ngram_containment(
            original=original_text, test=cleaned_text, n=n
        )
        if ngram_frac >= ngram_thresh:
            logger.debug(
                "Document extraction for %r passed ngram check on attempt %d "
                "with score %.2f (OCR: %r; Document source: %s)",
                out_text_key,
                attempt,
                ngram_frac,
                doc_is_from_ocr,
                source,
            )
            best_score = ngram_frac
            break

        best_score = max(best_score, ngram_frac)
        logger.debug(
            "Document extraction for %r failed ngram check on attempt %d "
            "with score %.2f (OCR: %r; Document source: %s). Retrying...",
            out_text_key,
            attempt,
            ngram_frac,
            doc_is_from_ocr,
            source,
        )
    else:
        # Loop finished without `break`, i.e. no attempt passed. The
        # score key is intentionally left unset in this case. Note that
        # text written to `doc.attrs` by the attempts is not removed
        # here.
        msg = (
            f"Ngram check failed after {num_tries} tries trying to extract "
            f"{original_text_key!r}. Not returning any extracted text due to "
            "high possibility of LLM hallucination! "
            f"(Best score: {best_score:.2f}; OCR: {doc_is_from_ocr}; "
            f"Document source: {source})"
        )
        warn(msg, COMPASSWarning)
        return doc

    doc.attrs[f"{original_text_key}_ngram_score"] = best_score
    return doc
async def extract_ordinance_values(doc, parser, text_key, out_key):
    """Extract ordinance values for a single document

    Document must be known to contain ordinance text.

    Parameters
    ----------
    doc : elm.web.document.BaseDocument
        A document known to contain ordinance text. This means it must
        contain a `text_key` key in the attrs. You can run
        :func:`~compass.extraction.apply.extract_ordinance_text_with_llm`
        to have this attribute populated automatically for documents
        that are found to contain ordinance data. Note that if the
        document's attrs does not contain the `text_key` key (or the
        value stored there is empty), a warning is issued and the
        document is not processed.
    parser : obj
        Object with an awaitable ``parse`` method that takes the text
        stored under `text_key` as its only argument.
    text_key : str
        Name of the key under which cleaned text is stored in
        `doc.attrs`. This text should be ready for extraction.
    out_key : str
        Name of the key under which extracted ordinances should be
        stored.

    Returns
    -------
    elm.web.document.BaseDocument
        Document that has been parsed for ordinance values. The results
        of the extraction are stored in the document's attrs.
    """
    ordinance_text = doc.attrs.get(text_key)
    if ordinance_text:
        doc.attrs[out_key] = await parser.parse(ordinance_text)
        return doc

    msg = (
        f"Input document has no {text_key!r} key or string "
        "does not contain info. Please run "
        "`extract_ordinance_text_with_llm` prior to calling this method."
    )
    warn(msg, COMPASSWarning)
    return doc
async def _parse_if_input_text_not_empty(
    text, text_splitter, parser, curr_text_name, next_text_name
):
    """Extract text using parser, or return empty if input empty

    Parameters
    ----------
    text : str
        Input text to run the parser on.
    text_splitter : obj
        Object implementing a `split_text` method, used to chunk the
        input text before it is handed to the parser.
    parser : callable
        Awaitable callable that takes the list of text chunks and
        returns the extracted text.
    curr_text_name : str
        Name of the input text (used in the warning message).
    next_text_name : str
        Name of the text being extracted (used in warning and log
        messages).

    Returns
    -------
    str
        The input unchanged if it is empty (a warning is issued), an
        empty string if the parser output is longer than
        ``_TEXT_OUT_CHAR_BUFFER`` times the input length, and the
        parser output otherwise.
    """
    if not text:
        msg = (
            f"{curr_text_name!r} does not contain any text. Skipping "
            f"extraction for {next_text_name!r}"
        )
        warn(msg, COMPASSWarning)
        return text

    chunks = text_splitter.split_text(text)
    parsed_text = await parser(chunks)

    num_chars_in = len(text)
    num_chars_out = len(parsed_text)
    if num_chars_out <= _TEXT_OUT_CHAR_BUFFER * num_chars_in:
        logger.debug_to_file(
            "Extracted text for %r is:\n%s", next_text_name, parsed_text
        )
        return parsed_text

    # The parser should only trim the input, so noticeably longer output
    # is treated as a hallucination
    logger.debug(
        "LLM output more text than was given (IN: %d, OUT: %d). "
        "Throwing away response due to possible hallucination...",
        num_chars_in,
        num_chars_out,
    )
    return ""