Source code for docria.algorithm

# -*- coding: utf-8 -*-
#
# Copyright 2021 Marcus Klang (marcus.klang@cs.lth.se)
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
"""Functions for various processing purposes"""
from docria.model import Document, Node, NodeLayerCollection, DataTypeEnum, TextSpan, Text
from typing import Set, List, Callable, Tuple, Dict, Optional, Iterator, Iterable, Any
from collections import deque, namedtuple, defaultdict
import functools


def get_prop(prop, default=None):
    """First-order function which can be used to extract a property of nodes"""
    def get(n: Node):
        return default if prop not in n else n[prop]
    return get

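
# Usage sketch for get_prop (hedged: "token" and "pos" are hypothetical
# layer/field names, not part of this module): build the extractor once,
# then apply it per node.
def _example_get_prop(doc: Document) -> List[Any]:
    get_pos = get_prop("pos", default="UNK")
    return [get_pos(tok) for tok in doc.layer["token"]]
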
def chain(*fns):
    """Create a new function which applies the given functions in sequence"""
    def forward(x):
        for fn in fns:
            x = fn(x)
        return x
    return forward

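
# Usage sketch for chain: compose plain functions left to right; the same
# pattern works with the extractors returned by get_prop.
def _example_chain() -> str:
    normalize = chain(str.strip, str.lower)
    return normalize("  Hello World  ")  # -> "hello world"
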
def children_of(layer: NodeLayerCollection, *props):
    """Get children of the given properties

    Note: the code checks against the schema whether each property
    is a single node reference or an array of node references."""
    def node_iter(prop):
        def yielder(n: Node):
            if prop in n:
                yield n[prop]
        return yielder

    def node_array_iter(prop):
        def yielder(n: Node):
            if prop in n:
                return iter(n[prop])
            else:
                return iter(())  # Empty iterator
        return yielder

    def compose(yielders):
        def yielder(n: Node):
            for yielder_fn in yielders:
                yield from yielder_fn(n)
        return yielder

    if len(props) == 1:
        typedef = layer.schema.fields[props[0]]
        if typedef.typename == DataTypeEnum.NODEREF:
            return node_iter(props[0])
        elif typedef.typename == DataTypeEnum.NODEREF_MANY:
            return node_array_iter(props[0])
        else:
            raise ValueError("field %s has type %s which is not supported as a children type" % (
                props[0], repr(typedef)))
    elif len(props) > 1:
        yielders = []
        for prop in props:
            typedef = layer.schema.fields[prop]
            if typedef.typename == DataTypeEnum.NODEREF:
                yielders.append(node_iter(prop))
            elif typedef.typename == DataTypeEnum.NODEREF_MANY:
                yielders.append(node_array_iter(prop))
            else:
                raise ValueError("field %s has type %s which is not supported as a children type" % (
                    prop, repr(typedef)))
        return compose(yielders)
    else:
        raise ValueError("props has to contain at least one property!")

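
# Usage sketch for children_of (hedged: a hypothetical "sentence" layer with
# a node-array field "tokens"): the returned function iterates child nodes
# and plugs directly into bfs/dfs below.
def _example_children_of(doc: Document) -> List[List[Node]]:
    sentences = doc.layer["sentence"]
    tokens_of = children_of(sentences, "tokens")
    return [list(tokens_of(sent)) for sent in sentences]
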
def bfs(start: Node,
        children: Callable[[Node], Iterator[Node]],
        is_result: Optional[Callable[[Node], bool]] = None) -> Iterator[Tuple[int, Node]]:
    """
    Breadth-first search

    :param start: the start node
    :param children: function returning a children iterator for a given node
    :param is_result: optional function indicating whether a node should be emitted, default is true for all
    :return: iterator of (depth, node) tuples found during the search
    """
    visited = set()
    queue = deque()
    queue.append((0, start))

    while queue:
        current_depth, current_node = queue.popleft()
        if current_node in visited:
            continue

        visited.add(current_node)
        if is_result is None or is_result(current_node):
            yield current_depth, current_node

        for child in children(current_node):
            if child not in visited:
                queue.append((current_depth + 1, child))

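
# Usage sketch for bfs (hedged: a hypothetical "phrase" layer with a
# node-array field "children"): collect every reachable node with its depth.
def _example_bfs(doc: Document, root: Node) -> List[Tuple[int, Node]]:
    children = children_of(doc.layer["phrase"], "children")
    return list(bfs(root, children))
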
def dfs(start: Node,
        children: Callable[[Node], Iterator[Node]],
        is_result: Optional[Callable[[Node], bool]] = None) -> Iterator[Node]:
    """
    Depth-first search

    :param start: the start node
    :param children: function returning a children iterator for a given node
    :param is_result: optional function indicating whether a node should be emitted, default is true for all
    :return: iterator of nodes found during the search
    """
    visited = set()
    stack = list()
    stack.append(start)

    while stack:
        current = stack.pop()
        if current in visited:
            continue

        visited.add(current)
        if is_result is None or is_result(current):
            yield current

        child_nodes = [ch for ch in children(current) if ch not in visited]
        child_nodes.reverse()
        stack.extend(child_nodes)

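
# Usage sketch for dfs (hedged: the same hypothetical "phrase" layer and a
# hypothetical "tag" field): emit only nodes matching a predicate, in
# depth-first order.
def _example_dfs(doc: Document, root: Node) -> List[Node]:
    children = children_of(doc.layer["phrase"], "children")
    return list(dfs(root, children, is_result=lambda n: "tag" in n))
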
def dfs_leaves(start: Node,
               children: Callable[[Node], Iterator[Node]],
               is_result: Optional[Callable[[Node], bool]] = None) -> Iterator[Node]:
    """
    Depth-first search which only returns the leaves, i.e. nodes without children or outgoing links

    :param start: the start node
    :param children: function returning a children iterator for a given node
    :param is_result: optional function indicating whether a node should be emitted, default is true for all
    :return: iterator of leaf nodes found during the search
    """
    visited = set()
    stack = list()
    stack.append(start)

    while stack:
        current = stack.pop()
        if current in visited:
            continue

        visited.add(current)
        child_nodes = [ch for ch in children(current) if ch not in visited]
        child_nodes.reverse()

        if not child_nodes and (is_result is None or is_result(current)):
            yield current

        stack.extend(child_nodes)

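
# Usage sketch for dfs_leaves (hedged: the same hypothetical layer and a
# "text" span field): the leaves of a phrase tree are typically the
# token-level nodes, which can be joined back into a surface string.
def _example_dfs_leaves(doc: Document, root: Node) -> str:
    children = children_of(doc.layer["phrase"], "children")
    return " ".join(str(leaf["text"]) for leaf in dfs_leaves(root, children) if "text" in leaf)
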
def span_translate(doc: Document, mapping_layer: str, target_source_map: Tuple[str, str],
                   layer_remap: str, source_target_remap: Tuple[str, str]):
    """
    Translate span ranges from a partial extraction to the original data.

    Target is the original data; source is the partial extraction ranges.

    :param doc: document
    :param mapping_layer: the layer which contains the mapping
    :param target_source_map: tuple of (target field, source field)
    :param layer_remap: the layer which should be mapped
    :param source_target_remap: tuple of (source field, target field)
    """
    target_pos, source_pos = target_source_map
    source_pos_remap, target_pos_remap = source_target_remap

    mapping_layer = doc.layer[mapping_layer]
    assert mapping_layer.schema.fields[target_pos].typename == DataTypeEnum.SPAN
    assert mapping_layer.schema.fields[source_pos].typename == DataTypeEnum.SPAN

    layer_remap = doc.layer[layer_remap]
    assert layer_remap.schema.fields[source_pos_remap].typename == DataTypeEnum.SPAN
    assert layer_remap.schema.fields[target_pos_remap].typename == DataTypeEnum.SPAN

    target_text = doc.texts[mapping_layer.schema.fields[target_pos].options["context"]]

    # 1. Find start/end point interval intersections against the mapping
    # 1.1 Produce the mapping array
    mapping_in_source = []
    for m in mapping_layer:
        source_start = m[source_pos].start
        source_stop = m[source_pos].stop

        mapping_in_source.append(((source_start, 0), (0, m)))
        mapping_in_source.append(((source_stop, -1), (0, m)))

    for n in layer_remap:
        if target_pos_remap not in n:
            source_remap_start = n[source_pos_remap].start
            source_remap_stop = n[source_pos_remap].stop - 1

            mapping_in_source.append(((source_remap_start, 1), (1, n)))
            mapping_in_source.append(((source_remap_stop, 2), (1, n)))

    mapping_in_source.sort(key=lambda tup: tup[0])

    # 2. Translate points using the relative distance from the start of the enclosing interval
    remap_start_offsets = {}
    active_interval_start = None
    active_interval_target_start = None

    for pos, source in mapping_in_source:
        marker, markertype = pos
        sourcetype, node = source

        if sourcetype == 0:
            if markertype == -1:  # End of a mapping interval
                active_interval_start = None
                active_interval_target_start = None
            elif markertype == 0:  # Start of a mapping interval
                assert active_interval_start is None, "Mapping which overlaps is not allowed!"
                active_interval_start = marker
                active_interval_target_start = node[target_pos].start
            else:
                assert False, "Bug! Should never happen."
        elif sourcetype == 1:
            assert active_interval_start is not None, \
                "Current position %d is outside any mapping interval, " \
                "i.e. there is a gap in the mapping!" % marker

            if markertype == 1:  # Start of a span to remap
                remap_start_offsets[node._id] = (marker - active_interval_start) + active_interval_target_start
            elif markertype == 2:  # End of a span to remap
                assert node._id in remap_start_offsets, \
                    "Start was not encountered, possibly invalid input data or a bug!"
                stop_offset = (marker - active_interval_start) + active_interval_target_start + 1
                start_offset = remap_start_offsets[node._id]
                node[target_pos_remap] = target_text[start_offset:stop_offset]

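
# Usage sketch for span_translate (hedged: every layer and field name here
# is hypothetical). A "mapping" layer relates spans in the original text
# ("target" field) to spans in an extracted fragment ("source" field); token
# nodes carrying only fragment-relative spans in "frag_text" get spans in the
# original text written to "full_text".
def _example_span_translate(doc: Document):
    span_translate(doc, "mapping", ("target", "source"),
                   "token", ("frag_text", "full_text"))
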
def is_covered_by(span_a: TextSpan, span_b: TextSpan) -> bool:
    """
    Covered-by predicate

    :param span_a: the span that is tested for being covered
    :param span_b: the span that might cover span_a
    :return: true or false
    """
    return span_a.start >= span_b.start and span_a.stop <= span_b.stop

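
# Minimal sketch for is_covered_by (hedged: hypothetical nodes with the
# default "text" span field): True when the token lies fully inside the sentence.
def _example_is_covered_by(token: Node, sentence: Node) -> bool:
    return is_covered_by(token["text"], sentence["text"])
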
def group_by_span(group_nodes: List[Node],
                  layer_nodes: Dict[str, Iterable[Node]],
                  resolution="intersect",
                  group_span_field="text",
                  layer_span_field: Optional[Dict[str, str]] = None,
                  include_empty_groups=True) \
        -> List[Tuple[Node, Dict[str, List[Node]]]]:
    """
    Group all nodes in layer_nodes into the corresponding group node.

    Nodes with textspans that are NIL/None are ignored.

    :param group_nodes: the nodes to group by
    :param layer_nodes: the nodes to assign to zero or more groups
    :param resolution: which resolution algorithm shall be used: *intersect* or *cover*

      * "**intersect**": the identity function for resolutions (all intersecting nodes are grouped)
      * "**cover**": imposes the requirement that the group node must fully cover the layer node
        (node_start >= group_start and node_stop <= group_stop)

    :param group_span_field: name of the textspan property, *default field* is "text"
    :param layer_span_field: dictionary {layer: field name of the textspan}, *default field* is "text"
    :param include_empty_groups: include groups which do not contain any matching layer nodes
    :return: list of tuples: (group node, dictionary with layer name -> [content of the group for this layer])
    """
    if layer_span_field is None:
        layer_span_field = defaultdict(lambda: "text")

    node_list = []  # type: List[Tuple[Tuple[int, int], Tuple[Optional[str], Node]]]

    # 1. Convert all nodes to start/stop symbols with added context information
    for group_node in group_nodes:
        if group_span_field in group_node:
            span = group_node[group_span_field]  # type: TextSpan
            if span.start == span.stop:  # singleton
                node_list.append(((span.start, 3), (None, group_node)))
                node_list.append(((span.stop, 4), (None, group_node)))
            else:
                node_list.append(((span.start, 0), (None, group_node)))
                node_list.append(((span.stop, -2), (None, group_node)))

    for layer_name, layer in layer_nodes.items():
        try:
            span_name = layer_span_field[layer_name]
        except KeyError as e:
            raise KeyError("Could not find span property name for layer: %s" % layer_name) from e

        for layer_node in layer:
            if span_name in layer_node:
                span = layer_node[span_name]  # type: TextSpan
                if span.start == span.stop:  # singleton
                    node_list.append(((span.start, 3), (layer_name, layer_node)))
                    node_list.append(((span.stop, 4), (layer_name, layer_node)))
                else:
                    node_list.append(((span.start, 1), (layer_name, layer_node)))
                    node_list.append(((span.stop, -1), (layer_name, layer_node)))

    # 2. Sort by (position, marker type) and bucket identical keys together
    node_list.sort(key=lambda tup: tup[0])

    node_list_groups = []  # type: List[List[Tuple[Tuple[int, int], Tuple[Optional[str], Node]]]]
    current_group = None
    for tup in node_list:
        if tup[0] is not None and current_group == tup[0]:
            node_list_groups[-1].append(tup)
        else:
            node_list_groups.append([tup])
            current_group = tup[0]

    # 3. Run the sweep and assign the relevant nodes to all groups
    groups = dict()  # type: Dict[Node, Dict[str, List[Node]]]
    group_list = list()  # type: List[Tuple[Node, Dict[str, List[Node]]]]
    open_nodes = set()
    open_groups = set()

    k = 0
    while k < len(node_list_groups):
        nodes = node_list_groups[k]
        if 0 <= nodes[0][0][1] < 4:  # Start markers
            for _, (layer, node) in nodes:
                if layer is not None:
                    open_nodes.add((layer, node))
                    for open_group in open_groups:
                        groups[open_group][layer].append(node)
                else:
                    group_dict = {name: [] for name in layer_nodes.keys()}
                    groups[node] = group_dict
                    group_list.append((node, group_dict))
                    open_groups.add(node)

                    for open_layer, open_node in open_nodes:
                        group_dict[open_layer].append(open_node)
        else:  # Stop markers
            for _, (layer, node) in nodes:
                if layer is not None:
                    open_nodes.remove((layer, node))
                else:
                    open_groups.remove(node)
        k += 1

    # 4. Apply the resolution algorithm if necessary
    if resolution == "cover":
        for i in range(len(group_list)):
            group_node, layer_group_nodes = group_list[i]
            group_span = group_node[group_span_field]
            group_list[i] = (group_node, {
                name: [n for n in v if is_covered_by(n[layer_span_field[name]], group_span)]
                for name, v in layer_group_nodes.items()
            })

    # 5. Final filtering if necessary
    if not include_empty_groups:
        group_list = [grp for grp in group_list if sum(map(len, grp[1].values())) > 0]

    # 6. Return the result
    return group_list

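
# Usage sketch for group_by_span (hedged: "sentence" and "token" are
# hypothetical layers sharing the default "text" span field): bucket tokens
# into the sentences that fully cover them.
def _example_group_by_span(doc: Document) -> List[Tuple[Node, Dict[str, List[Node]]]]:
    return group_by_span(list(doc.layer["sentence"]),
                         {"token": doc.layer["token"]},
                         resolution="cover")
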
def dominant_right(segments: List[Tuple[int, int, Any]]) -> List[Any]:
    """
    Resolve overlapping segments using the dominant right rule:
    the longest segment wins and, at equal length, the rightmost wins.

    :param segments: tuples of (start, stop, data)
    :return: list of data
    """
    segment_list = []
    for tup in segments:
        start, stop, item = tup
        segment_list.append(((start, 0), tup))
        if start != stop:
            segment_list.append(((stop - 1, 1), tup))
        else:
            segment_list.append(((stop, 1), tup))

    segment_list.sort(key=lambda el: el[0])

    segment_output = []
    open_node = None
    for (off, mode), tup in segment_list:
        if mode == 0:
            if open_node is not None:
                start, stop, item = open_node
                if stop - start <= tup[1] - tup[0]:
                    open_node = tup
            else:
                open_node = tup
        else:
            if open_node is tup:
                segment_output.append(tup[2])
                open_node = None

    return segment_output

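
# Usage sketch for dominant_right: "bc" beats the equally long "ab" because
# it starts further right, and "c" survives since it does not overlap the
# winner; the result is ["bc", "c"].
def _example_dominant_right() -> List[str]:
    return dominant_right([(0, 2, "ab"), (1, 3, "bc"), (3, 4, "c")])
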
def dominant_right_span(nodes: Iterable[Node], spanfield: str = "text") -> List[Node]:
    """
    Resolve overlapping spans using the dominant right rule:
    the longest span wins and, at equal length, the rightmost wins.

    :param nodes: nodes to resolve
    :param spanfield: the name of the span field
    :return: list of nodes
    """
    segments = [(n[spanfield].start, n[spanfield].stop, n) for n in nodes if spanfield in n]
    return dominant_right(segments)

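
# Usage sketch for dominant_right_span (hedged: a hypothetical "mention"
# layer with the default "text" span field): keep a non-overlapping subset.
def _example_dominant_right_span(doc: Document) -> List[Node]:
    return dominant_right_span(doc.layer["mention"])
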
def sequence_to_textspans(token_sequence: List[str],
                          text: Text,
                          start_offset: int = 0,
                          stop_offset: Optional[int] = None,
                          k: int = 1) -> List[TextSpan]:
    """
    Convert a sequence of strings, e.g. produced by a tokenizer, into the matching textspans in a raw text.

    :param token_sequence: sequence of strings to find
    :param text: the raw text to search in
    :param start_offset: the starting offset, default is from the start
    :param stop_offset: the stop offset, default is to the end
    :param k: maximum number of tokens to look ahead for a better matching token
              (if a token is not present in the text, k = 1 will test one token ahead
              and, if it matches closer, select that one instead)
    :return: list of spans; tokens which could not be found get a zero-length span at the last matched position
    """
    output = []
    raw_text = str(text)[start_offset:] if stop_offset is None else str(text)[start_offset:stop_offset]
    pos = 0
    i = 0
    while i < len(token_sequence):
        location = raw_text.find(token_sequence[i], pos, stop_offset)
        if location == -1:
            # Zero-length mapping
            output.append(text[start_offset + pos:start_offset + pos])
        else:
            # Look ahead: check whether any of the next k tokens matches
            # strictly before (not <=, just <) this match
            found_closer = False
            for j in range(1, min(k + 1, len(token_sequence) - i)):
                cand_location = raw_text.find(token_sequence[i + j], pos, stop_offset)
                if cand_location != -1 and cand_location < location:
                    # Found something closer: add zero-length mappings for the tokens in between
                    for ic in range(i, i + j):
                        output.append(text[start_offset + pos:start_offset + pos])

                    # Add our match
                    output.append(text[(start_offset + cand_location):
                                       (start_offset + cand_location + len(token_sequence[i + j]))])

                    # Move pos and i forward
                    pos = cand_location + len(token_sequence[i + j])
                    i = i + j
                    found_closer = True
                    break

            if not found_closer:
                # Standard match
                output.append(text[(start_offset + location):
                                   (start_offset + location + len(token_sequence[i]))])
                pos = location + len(token_sequence[i])

        i += 1

    return output

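
# Usage sketch for sequence_to_textspans (hedged: assumes an existing Text
# instance): align externally produced tokens back onto the raw text; tokens
# that cannot be located come back as zero-length spans.
def _example_sequence_to_textspans(text: Text) -> List[TextSpan]:
    tokens = str(text).split()
    return sequence_to_textspans(tokens, text)
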