Source code for buildingmotif.label_parsing.parser

import inspect
from abc import ABC, abstractmethod
from collections import defaultdict
from dataclasses import dataclass, field
from typing import Dict, List, Tuple

from buildingmotif.label_parsing.tokens import Constant, Identifier, Null, TokenResult

# TODO: programming by example?
# TODO: get LLM to write a parser for the point labels, given a human description


# type definition for the output of a parser function
@dataclass(frozen=True)
class ParseResult:
    """Outcome of running a parser over a single target string.

    The instance is frozen: its fields cannot be reassigned after construction.
    """

    # token results produced by the parser, in order
    tokens: List[TokenResult]
    # whether the parse was judged successful by the code that built this result
    success: bool
    # error messages supplied at construction time; the ``errors`` property
    # below does not read this field
    _errors: List[str] = field(default_factory=list)

    @property
    def errors(self) -> List[Tuple[str, int]]:
        """Return ``(error, offset)`` pairs for every token that has an error.

        The offset of a token is the sum of the lengths of all tokens before
        it, i.e. the position in the string where that token starts.
        """
        located = []
        position = 0
        for tok in self.tokens:
            if tok.error:
                located.append((tok.error, position))
            # advance past this token whether or not it carried an error
            position += tok.length
        return located
# type definition for the parser functions.
# A parser function takes a string and returns a list of TokenResult objects.
# Each result holds a token, the type of the token, and the length of the token.
# The length of the token is used to keep track of how much of the string
# has been parsed.
class Parser(ABC):
    """Abstract base class for parsers.

    A parser is called with a target string and returns a list of
    ``TokenResult`` objects. Constructing any subclass also records the
    constructor arguments on the instance as ``__args__``.
    """

    # constructor arguments captured by __new__, keyed by __init__ parameter name
    __args__: dict

    def __new__(mcls, *args, **kwargs):
        """When a parser is constructed, save its arguments into a dictionary __args__.
        This allows parsers to be serialized later without requiring bespoke
        (de)serialization code for every parser type.

        Keyword arguments are stored as given. Positional arguments are matched,
        in order, to the ``__init__`` parameters that were not passed by
        keyword; when there are more positional arguments than such parameters,
        the surplus is stored as a tuple under the last one (the ``*args``
        parameter).
        """
        instance = super().__new__(mcls)
        instance.__args__ = kwargs

        code = getattr(instance.__init__, "__code__", None)
        if code is None:
            # __init__ is not a Python function (e.g. it is inherited from
            # object), so there are no parameter names to bind to
            param_names = []
        else:
            # co_varnames lists the parameters first and the function's local
            # variables after them; keep only the parameters so positional
            # arguments are never recorded under a local variable's name
            param_count = code.co_argcount + code.co_kwonlyargcount
            if code.co_flags & inspect.CO_VARARGS:
                param_count += 1
            if code.co_flags & inspect.CO_VARKEYWORDS:
                param_count += 1
            # index 0 is ``self``
            param_names = list(code.co_varnames[1:param_count])

        # parameters still available for positional arguments
        unbound = [name for name in param_names if name not in kwargs]
        if unbound and len(args) > len(unbound):
            # fold the surplus positional arguments into one tuple for the
            # last unbound name
            last = len(unbound) - 1
            args = [*args[:last], args[last:]]
        # NOTE(review): when exactly one value lands on a *args parameter it is
        # stored bare rather than as a 1-tuple; kept as-is because the
        # serialization code that consumes __args__ is not visible here.
        instance.__args__.update(zip(unbound, args))
        return instance

    @abstractmethod
    def __call__(self, target: str) -> List[TokenResult]:
        """Parse ``target`` and return the resulting list of token results."""
        pass
# function which takes the results of parse_list and turns all of the
# results (not failures) into token dictionaries.
# A token dictionary has a 'label' key which is the label (the keys
# of the results dictionary), and a 'tokens' key holding a list of tokens.
# A token is a dictionary with 'identifier' (the substring part of the result)
# and 'type' (the type of the result if it is a constant)
def results_to_tokens(results):
    """Turn parse results into a list of per-label token dictionaries.

    Only ``Constant`` and ``Identifier`` tokens are considered; they are taken
    two at a time, and each pair becomes one
    ``{"identifier": ..., "type": ...}`` entry. A ``Constant`` that is left
    without a partner is emitted with the label itself as its identifier.

    :param results: mapping of label to the token results parsed from it
        (presumably the first element returned by ``parse_list``)
    :return: a list with one ``{"label": label, "tokens": [...]}`` dictionary
        per label in ``results``
    """

    def _is_constant_or_identifier(part):
        return isinstance(part.token, (Constant, Identifier))

    tokens = []
    for label in results:
        entry = {"label": label, "tokens": []}
        parts = iter(results[label])
        while True:
            # first_true returns None (it never raises StopIteration) once
            # the iterator has no more matching tokens
            first = first_true(parts, pred=_is_constant_or_identifier)
            second = first_true(parts, pred=_is_constant_or_identifier)
            if not first or not second:
                break
            # within a pair, either token may come first
            identifier = first if isinstance(first.token, Identifier) else second
            constant = first if isinstance(first.token, Constant) else second
            entry["tokens"].append(
                {
                    "identifier": identifier.token.value,
                    "type": constant.token.value.toPython(),
                }
            )
        # The loop only exits with `first` empty or without a partner. If a
        # constant is left over, add it to the token dictionary with the
        # label. The previous check (`first is None`) could only be true
        # after the iterator was exhausted, so leftover constants were
        # silently dropped.
        if first and isinstance(first.token, Constant):
            entry["tokens"].append(
                {"identifier": label, "type": first.token.value.toPython()}
            )
        tokens.append(entry)
    return tokens
# Analyzes the failures of a parser to capture all point labels.
# For each label in the failures, compute the length of the found tokens;
# whatever follows that many characters is the remaining "unparsed" part
# of the label. The returned dictionary is keyed by the unparsed part, and
# each value is the list of labels that share it.
def analyze_failures(failures: Dict[str, List[TokenResult]]) -> Dict[str, List[str]]:
    """Analyze the failures of a parser.

    For every failed label, the lengths of its tokens are summed and the label
    text after that many characters is treated as its unparsed remainder.
    Labels are then grouped by that remainder.

    :param failures: mapping of label to the token results found for it
    :return: a ``defaultdict(list)`` mapping each unparsed remainder to the
        labels that share it, in the iteration order of ``failures``
    """
    # group the points by the unparsed portion
    grouped = defaultdict(list)
    for label, tokens in failures.items():
        consumed = sum(t.length for t in tokens)
        grouped[label[consumed:]].append(label)
    return grouped
# wrapper function for reading a list of strings
# applies a given parser to each string in the list
# returns a dictionary of the input strings to the result of the parser
# Keeps track of all strings that fail to parse and returns them in a
# second dictionary
def parse_list(
    parser, target_list
) -> Tuple[Dict[str, List[TokenResult]], Dict[str, List[TokenResult]]]:
    """
    Run one parser over every string in a list and sort the outcomes.

    :param parser: the parsing combinator function
    :type parser: Parser
    :param target_list: the list of strings to parse
    :type target_list: List[str]
    :return: a tuple ``(results, failed)``; each is a dictionary from target
        string to its tokens, split by whether the parse succeeded
    """
    parsed = {}
    unparsed = {}
    for label in target_list:
        outcome = parse(parser, label)
        # file the tokens under the successes or the failures
        bucket = parsed if outcome.success else unparsed
        bucket[label] = outcome.tokens
    return parsed, unparsed
# wrapper function for a parser that does the following:
# - apply the parser to the target
# - if the parser does not consume all the target, mark the result as
#   unsuccessful
# - return the result of the parser
def parse(parser: Parser, target: str) -> ParseResult:
    """
    Apply a parser to one target string and package the outcome.

    The parse counts as successful when the lengths of the kept tokens add up
    to the length of the target.

    :param parser: the parsing combinator function
    :type parser: Parser
    :param target: the target string to parse
    :type target: str
    :return: the result of the parser
    :rtype: ParseResult
    """
    kept = []
    for item in parser(target):
        # drop empty Null tokens, but keep anything that carries an error
        if item.error or not isinstance(item.token, Null):
            kept.append(item)

    # compare how much was consumed against the length of the target
    consumed = sum(item.length for item in kept)
    messages = [item.error for item in kept if item.error]
    return ParseResult(kept, consumed == len(target), messages)
# from itertools documentation
def first_true(iterable, default=None, pred=None):
    """Return the first item of *iterable* that passes the test.

    With no *pred*, the test is the item's own truthiness; otherwise it is
    the truthiness of ``pred(item)``. If no item passes, *default* is
    returned. Items are consumed from the iterable only up to and including
    the one returned.
    """
    # first_true([a,b,c], x) --> a or b or c or x
    # first_true([a,b], x, f) --> a if f(a) else b if f(b) else x
    for candidate in iterable:
        verdict = candidate if pred is None else pred(candidate)
        if verdict:
            return candidate
    return default