Source code for compass.extraction.small_wind.ordinance
"""Small wind ordinance document content collection and extraction
These methods help filter down the document text to only the portions
relevant to small wind ordinances.
"""
import logging
from compass.common import BaseTextExtractor
from compass.validation.content import Heuristic
from compass.llm.calling import StructuredLLMCaller
from compass.utilities.enums import LLMUsageCategory
from compass.utilities.parsing import merge_overlapping_texts
logger = logging.getLogger(__name__)
# Alternative names for small/medium wind systems; interpolated into the
# LLM prompts defined in this module.
_SMALL_WES_SYNONYMS = (
    "small wind energy turbines (SWET), non-commercial wind energy systems, "
    "on-site wind energy systems, distributed wind energy systems, medium "
    "wind energy systems (MWES), agricultural wind energy systems (AWES), "
    "residential wind energy systems, small wind turbines (SWT), "
    "or similar"
)

# Regulation categories of interest, phrased as a conjunction ("..., and ...")
_SEARCH_TERMS_AND = (
    "zoning, siting, setback, system design, and operational "
    "requirements/restrictions"
)

# Same categories phrased as a disjunction. Only the standalone word "and"
# is swapped (note the surrounding spaces) so that a term which merely
# contains the letters "and" is never mangled.
_SEARCH_TERMS_OR = _SEARCH_TERMS_AND.replace(" and ", " or ")

# System types that the small-wind prompts tell the LLM to disregard
_IGNORE_TYPES_MICRO = "private, micro, personal, building-mounted"
_IGNORE_TYPES_LARGE = "large, utility-scale, for-sale, commercial"
[docs]
class SmallWindHeuristic(Heuristic):
    """Vocabulary for a heuristic check for wind turbine mentions in text

    This class only supplies word lists; how they are applied to text is
    defined by the ``Heuristic`` base class.
    """

    NOT_TECH_WORDS = [
        # Large-scale systems that are out of scope for small wind
        "wind farm",
        "wind energy farm",
        "utility wind energy system",
        "commercial wind energy system",
        # Single words that merely contain "wind"
        "rewind",
        "windbreak",
        "windiest",
        "winds",
        "windshield",
        "window",
        "windy",
        # "wind ..." phrases
        "wind attribute",
        "wind blow",
        "wind break",
        "wind current",
        "wind damage",
        "wind data",
        "wind direction",
        "wind draft",
        "wind erosion",
        "wind energy resource atlas",
        "wind load",
        "wind movement",
        "wind orient",
        "wind resource",
        "wind runway",
        "prevailing wind",
        "downwind",
    ]
    """Words and phrases that indicate text is NOT about WECS"""

    GOOD_TECH_KEYWORDS = ["wind", "setback"]
    """Words that indicate we should keep a chunk for analysis"""

    GOOD_TECH_ACRONYMS = ["wecs", "wes", "swet", "pwet", "wef"]
    """Acronyms for WECS that we want to capture"""

    # The first 36 phrases are every "<qualifier> <noun>" pairing, grouped
    # by qualifier; the irregular phrases are appended afterwards.
    GOOD_TECH_PHRASES = [
        f"{qualifier} {noun}"
        for qualifier in (
            "small",
            "medium",
            "accessory",
            "on-site",
            "onsite",
            "on-farm",
            "distributed",
            "residential",
            "agricultural",
            "local",
            "behind-the-meter",
            "front-of-meter",
        )
        for noun in ("wecs", "turbine", "wind")
    ] + [
        "pwec",
        "swecs",
        "wind energy conversion",
        "wind turbine",
        "wind tower",
        "wind energy system",
    ]
    """Phrases that indicate text is about WECS"""
[docs]
class SmallWindOrdinanceTextCollector(StructuredLLMCaller):
    """Screen text chunks for small wind ordinances and keep the hits

    A chunk is kept only if the LLM answers two questions positively:
    does the chunk enact wind energy system requirements, and do those
    requirements apply to small, medium, or non-commercial systems.
    """

    CONTAINS_ORD_PROMPT = (
        "You extract structured data from text. Return your answer in JSON "
        "format (not markdown). Your JSON file must include exactly two "
        "keys. The first key is 'wind_reqs', which is a string that "
        f"summarizes all {_SEARCH_TERMS_AND} that are explicitly enacted "
        "in the text for a wind energy system (or wind turbine/tower) for "
        "a given jurisdiction. "
        "Note that wind energy bans are an important restriction to track. "
        "Include any **closely related provisions** if they clearly pertain "
        "to the **development, operation, modification, or removal** of wind "
        "energy systems (or wind turbines/towers). "
        "All restrictions should be enforceable - ignore any text that only "
        "provides a legal definition of the regulation. If the text does not "
        f"specify any concrete {_SEARCH_TERMS_OR} for a wind energy system, "
        "set this key to `null`. The last key is '{key}', which is a boolean "
        "that is set to True if the text excerpt explicitly details "
        f"{_SEARCH_TERMS_OR} for a wind energy system (or wind turbine/tower) "
        "and False otherwise. "
    )
    """Prompt to check if chunk contains WES ordinance info"""

    IS_SMALL_PROMPT = (
        "You are a legal scholar that reads ordinance text and determines "
        f"whether any of it applies to {_SEARCH_TERMS_OR} for **small, "
        "medium, or non-commercial wind energy systems**. Small, medium, or "
        "non-commercial energy systems may also be referred to as "
        f"{_SMALL_WES_SYNONYMS}. "
        "Your client is a private resident that does not care about "
        f"ordinances related to {_IGNORE_TYPES_MICRO} or "
        f"{_IGNORE_TYPES_LARGE} wind energy systems. Ignore any text "
        "related to such systems. "
        "Return your answer as a dictionary in JSON format (not markdown). "
        "Your JSON file must include exactly two keys. The first key is "
        "'summary' which contains a string that lists all of the types of "
        "wind energy systems the text applies to (if any). The second key is "
        "'{key}', which is a boolean that is set to True if any part of the "
        f"text excerpt details {_SEARCH_TERMS_OR} for the **small, medium, or "
        "non-commercial wind energy conversion systems** (or similar) that "
        "the client is interested in and False otherwise."
    )
    """Prompt to check if chunk is for small WES"""

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # Maps chunk index -> chunk text for every chunk kept so far
        self._ordinance_chunks = {}

    async def check_chunk(self, chunk_parser, ind):
        """Decide whether the chunk at `ind` holds small wind ordinances

        Parameters
        ----------
        chunk_parser : ParseChunksWithMemory
            Instance that contains a ``parse_from_ind`` method.
        ind : int
            Index of the chunk to check.

        Returns
        -------
        bool
            ``True`` if both checks pass (the chunk is then stored
            along with its neighbors), ``False`` as soon as one fails.
        """
        # Each stage: (parser key, LLM callback, message logged on
        # failure, message logged on success). Stages run in order and
        # the first failure ends the check.
        # NOTE(review): "x" is the literal key handed to the parser for
        # the second question; it presumably ends up in IS_SMALL_PROMPT
        # via the callback's ``key`` argument - confirm before renaming.
        stages = (
            (
                "contains_ord_info",
                self._check_chunk_contains_ord,
                "Text at ind %d does not contain ordinance info",
                "Text at ind %d does contain ordinance info",
            ),
            (
                "x",
                self._check_chunk_is_for_small_scale,
                "Text at ind %d is not for small WECS",
                "Text at ind %d is for small WECS",
            ),
        )
        for stage_key, callback, rejected_msg, accepted_msg in stages:
            passed = await chunk_parser.parse_from_ind(
                ind,
                key=stage_key,
                llm_call_callback=callback,
            )
            if not passed:
                logger.debug(rejected_msg, ind)
                return False
            logger.debug(accepted_msg, ind)

        _store_chunk(chunk_parser, ind, self._ordinance_chunks)
        logger.debug("Added text at ind %d to ordinances", ind)
        return True

    @property
    def contains_ord_info(self):
        """bool: ``True`` once at least one ordinance chunk is stored"""
        return bool(self._ordinance_chunks)

    @property
    def ordinance_text(self):
        """str: Stored chunks merged in ascending chunk-index order"""
        stored = self._ordinance_chunks
        logger.debug(
            "Grabbing %d ordinance chunk(s) from original text at these "
            "indices: %s",
            len(stored),
            list(stored),
        )
        # Indices are unique dict keys, so sorting the (index, text)
        # pairs orders them by index alone.
        ordered_chunks = [chunk for __, chunk in sorted(stored.items())]
        return merge_overlapping_texts(ordered_chunks)

    async def _check_chunk_contains_ord(self, key, text_chunk):
        """Ask the LLM whether a chunk enacts wind ordinance rules"""
        system_message = self.CONTAINS_ORD_PROMPT.format(key=key)
        response = await self.call(
            sys_msg=system_message,
            content=text_chunk,
            usage_sub_label=LLMUsageCategory.DOCUMENT_CONTENT_VALIDATION,
        )
        logger.debug("LLM response: %s", response)
        # A response without the requested key counts as "no"
        return response.get(key, False)

    async def _check_chunk_is_for_small_scale(self, key, text_chunk):
        """Ask the LLM whether a chunk applies to small-scale systems"""
        system_message = self.IS_SMALL_PROMPT.format(key=key)
        response = await self.call(
            sys_msg=system_message,
            content=text_chunk,
            usage_sub_label=LLMUsageCategory.DOCUMENT_CONTENT_VALIDATION,
        )
        logger.debug("LLM response: %s", response)
        # A response without the requested key counts as "no"
        return response.get(key, False)
[docs]
class SmallWindPermittedUseDistrictsTextCollector(StructuredLLMCaller):
    """Check text chunks for permitted wind districts; collect them

    Every chunk is shown to the LLM together with ``DISTRICT_PROMPT``;
    chunks flagged as containing district information are stored
    along with their neighboring chunks.
    """

    # The fragments below are joined by implicit string concatenation,
    # which inserts nothing between them - every fragment that ends
    # mid-sentence must therefore carry its own trailing space.
    DISTRICT_PROMPT = (
        "You are a legal scholar that reads ordinance text and determines "
        "whether the text explicitly contains relevant information to "
        "determine the districts (and especially the district names) where "
        "small, medium, or non-commercial wind energy systems are a permitted "
        "use (i.e. accessory use), as well as the districts where wind energy "
        "systems are prohibited entirely. Small wind energy systems "
        f"(SWES) may also be referred to as {_SMALL_WES_SYNONYMS}. "
        "Do not make any inferences; only answer based on information that "
        "is explicitly stated in the text. "
        "Note that relevant information may sometimes be found in tables. "
        "Return your answer as a dictionary in JSON format (not markdown). "
        "Your JSON file must include exactly two keys. The first key is "
        "'districts' which contains a string that lists all of the district "
        "names for which the text explicitly permits **small, medium, or "
        "non-commercial wind energy systems** (if any). The last key is "
        "'{key}', which is a boolean that is set to True if any part of the "
        "text excerpt provides information on districts where **small, "
        "medium, or non-commercial wind energy systems** (or similar) are a "
        "permitted use (i.e. accessory use) in and False otherwise."
    )
    """Prompt to check if chunk contains info on permitted districts"""

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # Maps chunk index -> chunk text for every chunk kept so far
        self._district_chunks = {}

    async def check_chunk(self, chunk_parser, ind):
        """Check a chunk to see if it contains permitted uses

        Parameters
        ----------
        chunk_parser : ParseChunksWithMemory
            Instance whose ``text_chunks`` holds the document chunks.
        ind : int
            Index of the chunk to check.

        Returns
        -------
        bool
            Boolean flag indicating whether or not the text in the chunk
            contains small wind energy conversion system permitted use
            text.
        """
        key = "contains_district_info"
        content = await self.call(
            sys_msg=self.DISTRICT_PROMPT.format(key=key),
            content=chunk_parser.text_chunks[ind],
            usage_sub_label=(
                LLMUsageCategory.DOCUMENT_PERMITTED_USE_CONTENT_VALIDATION
            ),
        )
        logger.debug("LLM response: %s", content)

        # A response without the requested key counts as "no"
        contains_district_info = content.get(key, False)
        if contains_district_info:
            _store_chunk(chunk_parser, ind, self._district_chunks)
            logger.debug("Text at ind %d contains district info", ind)
            return True

        logger.debug("Text at ind %d does not contain district info", ind)
        return False

    @property
    def contains_district_info(self):
        """bool: Flag indicating whether text contains district info"""
        return bool(self._district_chunks)

    @property
    def permitted_use_district_text(self):
        """str: Combined permitted use districts text from the chunks"""
        logger.debug(
            "Grabbing %d permitted use chunk(s) from original text at "
            "these indices: %s",
            len(self._district_chunks),
            list(self._district_chunks),
        )
        # Merge in ascending chunk-index order
        text = [
            self._district_chunks[ind] for ind in sorted(self._district_chunks)
        ]
        return merge_overlapping_texts(text)
[docs]
class SmallWindOrdinanceTextExtractor(BaseTextExtractor):
    """Extract succinct ordinance text from input

    Purpose:
        Extract relevant ordinance text from document.
    Responsibilities:
        1. Extract portions from chunked document text relevant to
           particular ordinance type (e.g. wind zoning for small wind
           systems).
    Key Relationships:
        Both extraction methods forward to ``self._process`` with one
        of the prompts defined on this class and `_valid_chunk` as
        the chunk validity check.
    """

    WIND_ENERGY_SYSTEM_FILTER_PROMPT = (
        "# CONTEXT #\n"
        "We want to reduce the provided excerpt to only contain information "
        "about **wind energy systems**. The extracted text will be used for "
        "structured data extraction, so it must be both **comprehensive** "
        "(retaining all relevant details) and **focused** (excluding "
        "unrelated content), with **zero rewriting or paraphrasing**. "
        "Ensure that all retained information is "
        "**directly applicable to wind energy systems** while preserving "
        "full context and accuracy.\n"
        "\n# OBJECTIVE #\n"
        "Extract all text **pertaining to wind energy systems** from the "
        "provided excerpt.\n"
        "\n# RESPONSE #\n"
        "Follow these guidelines carefully:\n"
        "\n1. ## Scope of Extraction ##:\n"
        "- Include all text that pertains to **wind energy systems**.\n"
        "- Explicitly include any text related to **bans or prohibitions** "
        "on wind energy systems.\n"
        "- Explicitly include any text related to the adoption or enactment "
        "date of the ordinance (if any).\n"
        "\n2. ## Exclusions ##:\n"
        "- Do **not** include text that does not pertain to wind energy "
        "systems.\n"
        "\n3. ## Formatting & Structure ##:\n"
        "- **Preserve _all_ section titles, headers, and numberings** for "
        "reference.\n"
        "- **Maintain the original wording, formatting, and structure** to "
        "ensure accuracy.\n"
        "\n4. ## Output Handling ##:\n"
        "- This is a strict extraction task — act like a text filter, **not** "
        "a summarizer or writer.\n"
        "- Do not add, explain, reword, or summarize anything.\n"
        "- The output must be a **copy-paste** of the original excerpt.\n"
        "**Absolutely no paraphrasing or rewriting.**\n"
        "- The output must consist **only** of contiguous or discontiguous "
        "verbatim blocks copied from the input.\n"
        "- If **no relevant text** is found, return the response: "
        "'No relevant text.'"
    )
    """Prompt to extract ordinance text for WECS"""

    SMALL_WIND_ENERGY_SYSTEM_SECTION_FILTER_PROMPT = (
        "# CONTEXT #\n"
        "We want to reduce the provided excerpt to only contain information "
        "about **small, medium, or non-commercial wind energy systems**. The "
        "extracted text will be used for structured data extraction, so it "
        "must be both **comprehensive** (retaining all relevant details) and "
        "**focused** (excluding unrelated content), with **zero rewriting or "
        "paraphrasing**. Ensure that all retained information "
        "is **directly applicable** to small, medium, or non-commercial wind "
        "energy systems while preserving full context and accuracy.\n"
        "\n# OBJECTIVE #\n"
        "Extract all text **pertaining to small, medium or non-commercial "
        "wind energy systems** from the provided excerpt.\n"
        "\n# RESPONSE #\n"
        "Follow these guidelines carefully:\n"
        "\n1. ## Scope of Extraction ##:\n"
        "- Include all text that pertains to **small, medium, or "
        "non-commercial wind energy systems**, even if they are referred to "
        "by different names such as:\n"
        f"\t{_SMALL_WES_SYNONYMS.capitalize()}.\n"
        "- Explicitly include any text related to **bans or prohibitions** "
        "on small, medium, or non-commercial wind energy systems.\n"
        "- Explicitly include any text related to the adoption or enactment "
        "date of the ordinance (if any).\n"
        "- **Retain all relevant technical, design, operational, safety, "
        "environmental, and infrastructure-related provisions** that apply "
        "to the topic, such as (but not limited to):\n"
        "\t- Compliance with legal or regulatory standards.\n"
        "\t- Site, structural, or design specifications.\n"
        "\t- Environmental impact considerations.\n"
        "\t- Safety and risk mitigation measures.\n"
        "\t- Infrastructure, implementation, operation, and maintenance "
        "details.\n"
        "\t- All other **closely related provisions**.\n"
        "\n2. ## Exclusions ##:\n"
        "- Do **not** include text that explicitly applies **only** to "
        f"{_IGNORE_TYPES_MICRO} or {_IGNORE_TYPES_LARGE} "
        "wind energy systems.\n"
        "- Do **not** include text that does not pertain at all to wind "
        "energy systems.\n"
        "\n3. ## Formatting & Structure ##:\n"
        "- **Preserve _all_ section titles, headers, and numberings** for "
        "reference.\n"
        "- **Maintain the original wording, formatting, and structure** to "
        "ensure accuracy.\n"
        "\n4. ## Output Handling ##:\n"
        "- This is a strict extraction task — act like a text filter, **not** "
        "a summarizer or writer.\n"
        "- Do not add, explain, reword, or summarize anything.\n"
        "- The output must be a **copy-paste** of the original excerpt.\n"
        "**Absolutely no paraphrasing or rewriting.**\n"
        "- The output must consist **only** of contiguous or discontiguous "
        "verbatim blocks copied from the input.\n"
        "- If **no relevant text** is found, return the response: "
        "'No relevant text.'"
    )
    """Prompt to extract ordinance text for small WECS"""

    async def extract_wind_energy_system_section(self, text_chunks):
        """Extract ordinance text from input text chunks for WES

        Parameters
        ----------
        text_chunks : list of str
            List of strings, each of which represent a chunk of text.
            The order of the strings should be the order of the text
            chunks.

        Returns
        -------
        str
            Ordinance text extracted from text chunks.
        """
        return await self._process(
            text_chunks=text_chunks,
            instructions=self.WIND_ENERGY_SYSTEM_FILTER_PROMPT,
            is_valid_chunk=_valid_chunk,
        )

    async def extract_small_wind_energy_system_section(self, text_chunks):
        """Extract small WES ordinance text from input text chunks

        Parameters
        ----------
        text_chunks : list of str
            List of strings, each of which represent a chunk of text.
            The order of the strings should be the order of the text
            chunks.

        Returns
        -------
        str
            Ordinance text extracted from text chunks.
        """
        return await self._process(
            text_chunks=text_chunks,
            instructions=self.SMALL_WIND_ENERGY_SYSTEM_SECTION_FILTER_PROMPT,
            is_valid_chunk=_valid_chunk,
        )

    @property
    def parsers(self):
        """Iterable of parsers provided by this extractor

        The general wind filter is yielded before the small-wind
        filter.

        Yields
        ------
        name : str
            Name describing the type of text output by the parser.
        parser : callable
            Async function that takes a ``text_chunks`` input and
            outputs parsed text.
        """
        yield (
            "wind_energy_systems_text",
            self.extract_wind_energy_system_section,
        )
        yield (
            "cleaned_ordinance_text",
            self.extract_small_wind_energy_system_section,
        )
[docs]
class SmallWindPermittedUseDistrictsTextExtractor(BaseTextExtractor):
    """Extract permitted-use district text from input

    Purpose:
        Reduce document text to the portions describing permitted
        uses for zoning districts.
    Responsibilities:
        1. Filter chunked document text down to permitted-use
           content in general.
        2. Filter chunked document text down to permitted-use
           content for wind energy systems.
    Key Relationships:
        Both extraction methods forward to ``self._process`` with one
        of the prompts defined on this class and `_valid_chunk` as
        the chunk validity check.
    """

    # NOTE(review): not read anywhere in this class - presumably
    # consumed by ``BaseTextExtractor`` to label LLM usage; confirm.
    _USAGE_LABEL = LLMUsageCategory.DOCUMENT_PERMITTED_USE_DISTRICTS_SUMMARY

    PERMITTED_USES_FILTER_PROMPT = (
        "# CONTEXT #\n"
        "We want to reduce the provided excerpt to only contain information "
        "detailing permitted use(s) for a district. The extracted text will "
        "be used for structured data extraction, so it must be both "
        "**comprehensive** (retaining all relevant details) and **focused** "
        "(excluding unrelated content), with **zero rewriting or "
        "paraphrasing**. Ensure that all retained information "
        "is **directly applicable** to permitted use(s) for one or more "
        "districts while preserving full context and accuracy.\n"
        "\n# OBJECTIVE #\n"
        "Remove all text **not directly pertinent** to permitted use(s) for "
        "a district.\n"
        "\n# RESPONSE #\n"
        "Follow these guidelines carefully:\n"
        "\n1. ## Scope of Extraction ##:\n"
        "- Retain all text defining permitted use(s) for a district, "
        "including:\n"
        "\t- **Primary, Special, Conditional, Accessory, Prohibited, and "
        "any other use types.**\n"
        "\t- **District names and zoning classifications.**\n"
        "- Pay extra attention to any references to **wind energy "
        "facilities** or related terms.\n"
        "- Ensure that **tables, lists, and structured elements** are "
        "preserved as they may contain relevant details.\n"
        "\n2. ## Exclusions ##:\n"
        "- Do **not** include unrelated regulations, procedural details, "
        "or non-use-based restrictions.\n"
        "\n3. ## Formatting & Structure ##:\n"
        "- **Preserve _all_ section titles, headers, and numberings** for "
        "reference, **especially if they contain the district name**.\n"
        "- **Maintain the original wording, formatting, and structure** to "
        "ensure accuracy.\n"
        "\n4. ## Output Handling ##:\n"
        "- This is a strict extraction task — act like a text filter, **not** "
        "a summarizer or writer.\n"
        "- Do not add, explain, reword, or summarize anything.\n"
        "- The output must be a **copy-paste** of the original excerpt.\n"
        "**Absolutely no paraphrasing or rewriting.**\n"
        "- The output must consist **only** of contiguous or discontiguous "
        "verbatim blocks copied from the input.\n"
        "- If **no relevant text** is found, return the response: "
        "'No relevant text.'"
    )
    """Prompt to extract ordinance text for permitted uses"""

    WES_PERMITTED_USES_FILTER_PROMPT = (
        "# CONTEXT #\n"
        "We want to reduce the provided excerpt to only contain information "
        "detailing **wind energy system** permitted use(s) for a district. "
        "The extracted text will be used for structured data extraction, so "
        "it must be both **comprehensive** (retaining all relevant details) "
        "and **focused** (excluding unrelated content), with **zero rewriting "
        "or paraphrasing**. Ensure that all "
        "retained information is **directly applicable** to permitted use(s) "
        "for wind energy systems in one or more districts while "
        "preserving full context and accuracy.\n"
        "\n# OBJECTIVE #\n"
        "Remove all text **not directly pertinent** to wind energy conversion "
        "system permitted use(s) for a district.\n"
        "\n# RESPONSE #\n"
        "Follow these guidelines carefully:\n"
        "\n1. ## Scope of Extraction ##:\n"
        "- Retain all text defining permitted use(s) for a district, "
        "including:\n"
        "\t- **Primary, Special, Conditional, Accessory, Prohibited, and "
        "any other use types.**\n"
        "\t- **District names and zoning classifications.**\n"
        "- Ensure that **tables, lists, and structured elements** are "
        "preserved as they may contain relevant details.\n"
        "\n2. ## Exclusions ##:\n"
        "- Do not include text that does not pertain at all to wind "
        "energy systems.\n"
        "\n3. ## Formatting & Structure ##:\n"
        "- **Preserve _all_ section titles, headers, and numberings** for "
        "reference, **especially if they contain the district name**.\n"
        "- **Maintain the original wording, formatting, and structure** to "
        "ensure accuracy.\n"
        "\n4. ## Output Handling ##:\n"
        "- This is a strict extraction task — act like a text filter, **not** "
        "a summarizer or writer.\n"
        "- Do not add, explain, reword, or summarize anything.\n"
        "- The output must be a **copy-paste** of the original excerpt.\n"
        "**Absolutely no paraphrasing or rewriting.**\n"
        "- The output must consist **only** of contiguous or discontiguous "
        "verbatim blocks copied from the input.\n"
        "- If **no relevant text** is found, return the response: "
        "'No relevant text.'"
    )
    """Prompt to extract ordinance text for permitted uses for WECS"""

    async def extract_permitted_uses(self, text_chunks):
        """Filter text chunks down to permitted-use content

        Parameters
        ----------
        text_chunks : list of str
            Chunks of document text, in document order.

        Returns
        -------
        str
            Permitted-use text extracted from the chunks.
        """
        permitted_use_text = await self._process(
            text_chunks=text_chunks,
            instructions=self.PERMITTED_USES_FILTER_PROMPT,
            is_valid_chunk=_valid_chunk,
        )
        return permitted_use_text

    async def extract_wes_permitted_uses(self, text_chunks):
        """Filter text chunks down to wind system permitted-use content

        Parameters
        ----------
        text_chunks : list of str
            Chunks of document text, in document order.

        Returns
        -------
        str
            Wind energy system permitted-use text extracted from the
            chunks.
        """
        wes_permitted_use_text = await self._process(
            text_chunks=text_chunks,
            instructions=self.WES_PERMITTED_USES_FILTER_PROMPT,
            is_valid_chunk=_valid_chunk,
        )
        return wes_permitted_use_text

    @property
    def parsers(self):
        """Iterable of parsers provided by this extractor

        The general permitted-use filter is yielded before the
        wind-specific one.

        Yields
        ------
        name : str
            Name describing the type of text output by the parser.
        parser : callable
            Async function that takes a ``text_chunks`` input and
            outputs parsed text.
        """
        yield from (
            ("permitted_use_only_text", self.extract_permitted_uses),
            ("districts_text", self.extract_wes_permitted_uses),
        )
def _valid_chunk(chunk):
    """Check whether an extracted chunk holds usable content

    Parameters
    ----------
    chunk : str or None
        Text returned for one chunk.

    Returns
    -------
    bool
        ``False`` if `chunk` is empty/``None`` or contains the phrase
        "no relevant text" (case-insensitive), ``True`` otherwise.
    """
    # bool() keeps the return type a real boolean; a bare ``chunk and ...``
    # would hand back the falsy input itself ("" or None).
    return bool(chunk) and "no relevant text" not in chunk.lower()
def _store_chunk(parser, chunk_ind, store):
    """Record a chunk and its neighbors in `store`, keeping existing entries

    The window runs from ``parser.num_to_recall - 1`` chunks before
    `chunk_ind` through one chunk after it. Positions that fall
    outside ``parser.text_chunks`` are skipped.
    """
    chunks = parser.text_chunks
    window_start = chunk_ind - parser.num_to_recall + 1
    window_stop = chunk_ind + 2  # exclusive: includes the following chunk
    for position in range(window_start, window_stop):
        if 0 <= position < len(chunks):
            # setdefault leaves a previously stored chunk untouched
            store.setdefault(position, chunks[position])