Source code for compass.extraction.date
"""Ordinance date extraction logic"""
import logging
from datetime import datetime
from collections import Counter
from compass.utilities.enums import LLMUsageCategory
logger = logging.getLogger(__name__)
# These domains contain the collection date in URL, not enactment date
_BANNED_DATE_DOMAINS = ["https://energyzoning.org"]
[docs]
class DateExtractor:
    """Helper class to extract date info from document"""

    SYSTEM_MESSAGE = (
        "You are a legal scholar that reads ordinance text and extracts "
        "structured date information. "
        "Return your answer as a dictionary in JSON format (not markdown). "
        "Your JSON file must include exactly four keys. The first "
        "key is 'explanation', which contains a short summary of the most "
        "relevant date information you found in the text. The second key is "
        "'year', which should contain an integer value that represents the "
        "latest year this ordinance was enacted/updated, or null if that "
        "information cannot be found in the text. The third key is 'month', "
        "which should contain an integer value that represents the latest "
        "month of the year this ordinance was enacted/updated, or null if "
        "that information cannot be found in the text. The fourth key is "
        "'day', which should contain an integer value that represents the "
        "latest day of the month this ordinance was enacted/updated, or null "
        "if that information cannot be found in the text. Only provide values "
        "if you are confident that they represent the latest date this "
        "ordinance was enacted/updated"
    )

    def __init__(self, structured_llm_caller, text_splitter=None):
        """

        Parameters
        ----------
        structured_llm_caller : compass.llm.StructuredLLMCaller
            StructuredLLMCaller instance. Used for structured validation
            queries.
        text_splitter : langchain.text_splitter.TextSplitter, optional
            Optional text splitter instance to attach to doc (used for
            splitting out pages in an HTML document).
            By default, ``None``.
        """
        self.slc = structured_llm_caller
        self.text_splitter = text_splitter

    async def parse(self, doc):
        """Extract date (year, month, day) from doc

        If this extractor was given a text splitter and `doc` exposes a
        ``text_splitter`` attribute, the document's splitter is swapped
        for the extractor's one while parsing and is always restored
        afterwards, even if parsing raises.

        Parameters
        ----------
        doc : elm.web.document.BaseDocument
            Document with a `raw_pages` attribute.

        Returns
        -------
        tuple
            3-tuple containing year, month, day. Each element is
            ``None`` if it could not be found.
        """
        if self.text_splitter is None or not hasattr(doc, "text_splitter"):
            return await self._parse(doc)

        old_splitter = doc.text_splitter
        doc.text_splitter = self.text_splitter
        try:
            return await self._parse(doc)
        finally:
            # Never leave the caller's document holding our splitter,
            # including when the LLM call or the parsing fails.
            doc.text_splitter = old_splitter

    async def _parse(self, doc):
        """Extract date (year, month, day) from doc

        The source URL is tried first (unless it belongs to a banned
        domain); the page text is used when the URL gives no date.
        """
        url = doc.attrs.get("source")
        can_check_url_for_date = url and not any(
            sub_str in url for sub_str in _BANNED_DATE_DOMAINS
        )
        if can_check_url_for_date:
            logger.debug("Checking URL for date: %s", url)
            response = await self.slc.call(
                sys_msg=self.SYSTEM_MESSAGE,
                content=(
                    "Please extract the date from the URL for this "
                    f"ordinance, if possible:\n{url}"
                ),
                usage_sub_label=LLMUsageCategory.DATE_EXTRACTION,
            )
            if response:
                date = _parse_date([response])
                logger.debug("Parsed date from URL: %s", str(date))
                # A response whose year/month/day are all null means the
                # URL held no date; fall through to the document text
                # instead of reporting "no date" prematurely.
                if any(part is not None for part in date):
                    return date

        if not doc.raw_pages:
            return None, None, None

        responses = []
        for text in doc.raw_pages:
            if not text:
                continue

            response = await self.slc.call(
                sys_msg=self.SYSTEM_MESSAGE,
                content=f"Please extract the date for this ordinance:\n{text}",
                usage_sub_label=LLMUsageCategory.DATE_EXTRACTION,
            )
            if not response:
                continue
            responses.append(response)

        return _parse_date(responses)
def _parse_date(json_list):
"""Parse all date elements
True date is determined to be the most frequent date. In the case of
a tie, the latest date is chosen.
"""
if not json_list:
return None, None, None
years = _parse_date_element(
json_list,
key="year",
max_len=4,
min_val=2000,
max_val=datetime.now().year,
)
months = _parse_date_element(
json_list, key="month", max_len=2, min_val=1, max_val=12
)
days = _parse_date_element(
json_list, key="day", max_len=2, min_val=1, max_val=31
)
date_elements = Counter(zip(years, months, days, strict=False))
date = max(date_elements, key=lambda date: (date_elements[date], date))
return tuple(None if d < 0 else d for d in date)
def _parse_date_element(json_list, key, max_len, min_val, max_val):
"""Parse out a single date element"""
date_elements = [info.get(key) for info in json_list]
logger.debug("key=%r, date_elements=%r", key, date_elements)
return [
int(y)
if y is not None
and len(str(y)) <= max_len
and (min_val <= int(y) <= max_val)
else -1 * float("inf")
for y in date_elements
]