Source code for pybrat

import re
import os
import html
import json
import warnings
from copy import deepcopy
from collections import defaultdict
from pathlib import Path
from typing import List


class Annotation(object):
    """
    The base class for brat annotations.
    Use Span, Event, or Attribute instead of this class.

    :param str _id: the identifier of this annotation, e.g., 'T3'.
    :param str _type: the type (label) of this annotation.
    :param str _source_file: (Optional) path to the .ann file which contains
                             this annotation. Only the basename is stored.
    """
    def __init__(self, _id: str, _type: str, _source_file: str = None):
        assert isinstance(_id, str)
        assert isinstance(_type, str)
        assert isinstance(_source_file, (type(None), str))
        self._id = _id
        self._type = _type
        self._source_file = _source_file
        if _source_file is not None:
            self._source_file = os.path.basename(_source_file)

    @property
    def id(self):
        """The identifier given at construction (read-only)."""
        return self._id

    @property
    def type(self):
        """The type given at construction (read-only)."""
        return self._type

    def update(self, key, value):
        """
        Set the instance attribute named `key` to `value`.
        """
        self.__dict__[key] = value

    def __eq__(self, other):
        # Subclasses define what equality means for them.
        raise NotImplementedError()

    def __hash__(self):
        # Subclasses define hashing alongside equality.
        raise NotImplementedError()

    def __str__(self):
        return self.to_brat_str()

    @staticmethod
    def _brief_repr(value):
        """
        Return `short_repr()` for Annotation values and `repr()` for
        everything else. Annotations can reference each other, so nested
        annotations are abbreviated instead of being expanded in full.
        """
        if isinstance(value, Annotation):
            return value.short_repr()
        return repr(value)

    def __repr__(self):
        field_strings = []
        for (k, v) in self.__dict__.items():
            # Underscore-prefixed fields (_id, _type, ...) are not shown.
            if k.startswith('_'):
                continue
            if isinstance(v, dict):
                v_rep = repr({sub_k: self._brief_repr(sub_v)
                              for (sub_k, sub_v) in v.items()})
            elif isinstance(v, (list, tuple)):
                v_rep = repr([self._brief_repr(sub_v) for sub_v in v])
            else:
                v_rep = self._brief_repr(v)
            field_strings.append(f"{k}: {v_rep}")
        fields_str = ', '.join(field_strings)
        return f"{type(self).__name__}({fields_str})"

    def short_repr(self):
        """
        A compact representation containing only the type and id of
        this annotation, e.g., 'Span(type: Person, id: T1)'.
        """
        contents = [f"id: {self.id}"]
        if "_type" in self.__dict__:
            contents.insert(0, f"type: {self.type}")
        contents_str = ', '.join(contents)
        return f"{type(self).__name__}({contents_str})"

    def copy(self):
        """
        Performs a deep copy of this annotation.
        """
        return deepcopy(self)

    @staticmethod
    def _resolve_file_path(path):
        """
        Return the absolute, resolved form of `path` as a string. If `path`
        cannot be turned into a Path (TypeError, e.g., None), it is
        returned unchanged.
        """
        try:
            here = Path(path).resolve()
            abspath = str(here.absolute())
        except TypeError:
            abspath = path
        return abspath

    def to_brat_str(self):
        """
        Format this annotation as a brat string. Implemented by subclasses.
        """
        raise NotImplementedError()


class CharacterIndex(object):
    """
    The character offsets covered by an annotation, stored as one or more
    (start, end) pairs.

    :param list(tuple) sorted_spans: a list of tuples, each tuple containing
                                     the (start, end) indices of a text span.
    """

    def __init__(self, sorted_spans):
        self.sorted_spans = sorted_spans

    @property
    def start_index(self):
        """The start of the first span, or 0 if there are no spans."""
        if len(self.sorted_spans) > 0:
            return self.sorted_spans[0][0]
        return 0

    @property
    def end_index(self):
        """The end of the last span, or 0 if there are no spans."""
        if len(self.sorted_spans) > 0:
            return self.sorted_spans[-1][-1]
        return 0

    def __eq__(self, other):
        if not isinstance(other, CharacterIndex):
            return False
        # zip() stops at the shorter sequence, so the lengths must be
        # compared explicitly. Otherwise an empty or prefix index would
        # compare equal to a longer one.
        if len(self.sorted_spans) != len(other.sorted_spans):
            return False
        return all(self_span == other_span for (self_span, other_span)
                   in zip(self.sorted_spans, other.sorted_spans))

    def __iter__(self):
        # Yields every character index in every span; range() excludes
        # each span's end offset.
        return (idx for span in self.sorted_spans for idx in range(*span))

    def __add__(self, other):
        """
        Concatenate the spans of two CharacterIndex instances.
        The result is not re-sorted.
        """
        if not isinstance(other, CharacterIndex):
            raise TypeError("other element must be CharacterIndex.")
        spans = self.sorted_spans + other.sorted_spans
        return CharacterIndex(spans)

    def __hash__(self):
        # Hash the span contents rather than their string form so that
        # indices which compare equal always hash equal, whatever
        # container type holds the spans.
        return hash(tuple(tuple(span) for span in self.sorted_spans))

    def __str__(self):
        # 'start end' pairs joined by ';'.
        return ';'.join([f"{span[0]} {span[1]}" for span in self.sorted_spans])

    def __repr__(self):
        return f"CharacterIndex({str(self.sorted_spans)})"


class Span(Annotation):
    """
    A brat span. I.e., a span of text.

    :param str _id: the unique numerical identifier of this span with the 'T'
                    prefix. E.g., 'T3'.
    :param CharacterIndex indices: the CharacterIndex for this span.
    :param str text: the actual span text
    :param str _type: (Optional) a string giving the type of this span,
                      e.g., for NER. Default is 'Span'.
    :param str _source_file: (Optional), the name of the .ann file which
                             contains this span.
    :param dict attributes: (Optional) Attribute instances keyed by
                            attribute type. Each one has its `reference`
                            set to this span.
    """
    def __init__(self, _id: str, indices: CharacterIndex,
                 text: str, _type: str = "Span", _source_file: str = None,
                 attributes=None):
        super().__init__(_id=_id, _type=_type, _source_file=_source_file)
        assert isinstance(indices, CharacterIndex)
        assert isinstance(text, str)
        self.indices = indices
        self.text = text
        # Start with an empty mapping so the span is fully formed while
        # the supplied attributes are being pointed back at it.
        self.attributes = {}
        linked = attributes if attributes else {}
        for attribute in linked.values():
            attribute.reference = self
        self.attributes = linked

    @property
    def start_index(self):
        """The starting character index of this span."""
        return self.indices.start_index

    @property
    def end_index(self):
        """The ending character index of this span."""
        return self.indices.end_index

    def __eq__(self, other):
        # Spans are compared on type, indices and text; the id is ignored.
        if not isinstance(other, Span):
            return False
        checks = [
            self.type == other.type,
            self.indices == other.indices,
            self.text == other.text,
        ]
        return all(checks)

    def __hash__(self):
        key = (self.type, self.indices, self.text)
        return hash(key)

    def to_brat_str(self, output_references=False, seen=None):
        """
        Format this Span instance as a brat string.

        :param bool output_references: If True, also includes the brat string
            of the Attributes of this Span. Default False.
        :param set seen: (Optional) ids which have already been output.
            Anything whose id is in this set is rendered as ''.
        """
        seen = set() if seen is None else seen
        if self.id in seen:
            return ''
        seen.add(self.id)
        lines = [f"{self.id}\t{self.type} {str(self.indices)}\t{self.text}"]
        if output_references is True:
            for attribute in self.attributes.values():
                rendered = attribute.to_brat_str(output_references=False,
                                                 seen=seen)
                # Attributes that were already output come back empty.
                if rendered != '':
                    lines.append(rendered)
        return '\n'.join(lines)

    def asdict(self):
        """
        Return the fields of this span as a plain dictionary.
        The indices are given in their string form.
        """
        as_dict = {}
        as_dict['_id'] = self.id
        as_dict['_type'] = self.type
        as_dict['indices'] = str(self.indices)
        as_dict['text'] = self.text
        as_dict['_source_file'] = self._source_file
        return as_dict


class Attribute(Annotation):
    """
    A brat attribute. Can be attached to Spans or Events.

    :param str _id: the unique numerical identifier of this attribute with
                    the 'A' prefix. E.g., 'A5'.
    :param Any value: the value of this attribute.
    :param Annotation reference: the corresponding Span or Event instance.
                                 May be None, in which case it can be
                                 assigned later.
    :param str _type: (Optional) a string giving the type of this attribute.
                      Default is 'Attribute'.
    :param str _source_file: (Optional), the name of the .ann file which
                             contains this span.
    :raises ValueError: if reference is an Annotation other than a
                        Span or Event.
    """
    def __init__(self, _id, value, reference=None,
                 _type="Attribute", _source_file=None):
        super().__init__(_id=_id, _type=_type, _source_file=_source_file)
        assert isinstance(reference, (type(None), Annotation))
        self.value = value
        self.reference = reference
        if not isinstance(self.reference, (Span, Event, type(None))):
            raise ValueError(f"Attribute reference must be instance of Span, Event, or None. Got {type(self.reference)}.")  # noqa
        # Add this attribute to the reference annotation
        if isinstance(self.reference, (Span, Event)):
            self.reference.attributes[self._type] = self

    @property
    def _reference_id(self):
        # The id of the reference, or None when there is no reference
        # (the constructor default).
        if self.reference is None:
            return None
        return self.reference.id

    def __eq__(self, other):
        if not isinstance(other, Attribute):
            return False
        return all([
            self.type == other.type,
            self.value == other.value,
            # Attributes and events can point to each
            # other, so we'll use IDs to avoid endless recursion.
            self._reference_id == other._reference_id,
        ])

    def __hash__(self):
        return hash((
            self.type,
            self.value,
            self._reference_id,
        ))

    @property
    def span(self):
        """
        The Span this Attribute refers to. When the reference is an Event
        this is the Event's tuple of spans; when there is no reference
        it is None.
        """
        if self.reference is None:
            span = None
        elif isinstance(self.reference, Span):
            span = self.reference
        elif isinstance(self.reference, Event):
            span = self.reference.spans
        else:
            raise ValueError(f"reference must be Span, Event, or None. Got {type(self.reference)}.")  # noqa
        return span

    # The three index properties below read from the reference itself.
    # Going through `self.span` fails for Event references, because there
    # `span` is a tuple of spans with no index attributes, whereas both
    # Span and Event expose start_index, end_index and indices.

    @property
    def start_index(self):
        """
        The starting character index of this Attribute's reference.
        """
        return self.reference.start_index

    @property
    def end_index(self):
        """
        The ending character index of this Attribute's reference.
        """
        return self.reference.end_index

    @property
    def indices(self):
        """
        The indices of this Attribute's reference.
        """
        return self.reference.indices

    def to_brat_str(self, output_references=False, seen=None):
        """
        Format this Attribute instance as a brat string.

        :param bool output_references: If True, also includes the brat string
            of the reference of this Attribute. Default False.
        :param set seen: (Optional) ids which have already been output.
            Anything whose id is in this set is rendered as ''.
        :raises AttributeError: if this Attribute has no reference, since
            the reference id is part of the output line.
        """
        if seen is None:
            seen = set()
        if self.id in seen:
            return ''
        seen.add(self.id)
        outlines = []
        if output_references is True:
            if self.reference is not None:
                ref_str = self.reference.to_brat_str(output_references=False,
                                                     seen=seen)
                if ref_str != '':
                    outlines.append(ref_str)
        ref_id = self.reference.id
        outlines.append(f"{self.id}\t{self.type} {ref_id} {self.value}")
        return '\n'.join(outlines)

    def asdict(self):
        """
        Return the fields of this attribute as a plain dictionary.
        'ref_id' is None when the attribute has no reference.
        """
        return {'_id': self.id,
                '_type': self.type,
                'value': self.value,
                'ref_id': self._reference_id,
                '_source_file': self._source_file}


class Event(Annotation):
    """
    A brat event, composed of one or more ordered Span instances.
    pybrat does not enforce any specific Event structure.

    :param str _id: the unique numerical identifier of this event
                    with the E prefix. E.g., 'E10'.
    :param Span spans: one or more Span instances. Event instances are
                       accepted as well, which allows nested events.
    :param dict attributes: a dictionary of Attribute instances
                            keyed by attribute type. Each one has its
                            `reference` set to this event.
    :param str _type: (Optional) A type for this Event. Default is 'Event'.
    :param str _source_file: (Optional), the name of the .ann file which
                             contains this span.
    """
    def __init__(self, _id, *spans, attributes=None,
                 _type="Event", _source_file=None):
        super().__init__(_id=_id, _type=_type, _source_file=_source_file)
        for span in spans:
            assert isinstance(span, (Span, Event)), f"Not a Span instance: {span}"  # noqa
        self.spans = spans
        self.attributes = attributes or {}
        for attr in self.attributes.values():
            attr.reference = self

    def __eq__(self, other):
        if not isinstance(other, Event):
            return False
        return all([
            # The type is part of __hash__, so it has to be part of
            # equality too: objects that compare equal must hash equal.
            self.type == other.type,
            self.spans == other.spans,
            self.attributes == other.attributes,
        ])

    def __hash__(self):
        return hash((self.type, self.spans))

    @property
    def start_index(self):
        """
        The lowest character index of this Event's spans.
        """
        return min(span.start_index for span in self.spans)

    @property
    def end_index(self):
        """
        The highest character index of this Event's spans.
        """
        return max(span.end_index for span in self.spans)

    @property
    def indices(self):
        """
        The indices of all of this Event's spans, concatenated in
        span order into a single CharacterIndex.
        """
        return sum([span.indices for span in self.spans], CharacterIndex([]))

    def to_brat_str(self, output_references=False, seen=None):
        """
        Format this Event instance as a brat string.

        :param bool output_references: If True, also includes the brat string
            of the Spans and Attributes of this Event. Default False.
        :param set seen: (Optional) ids which have already been output.
            Anything whose id is in this set is rendered as ''.
        """
        if seen is None:
            seen = set()
        if self.id in seen:
            return ''
        seen.add(self.id)
        event_str = f"{self.id}\t"
        for (i, span) in enumerate(self.spans):
            # The first span is labelled with the event's own type;
            # the others keep their own type as the label.
            spantype = span.type
            if i == 0:
                spantype = self.type
            event_str += f"{spantype}:{span.id} "
        outlines = [event_str.strip()]
        if output_references is True:
            attr_strs = [a.to_brat_str(output_references=False, seen=seen)
                         for a in self.attributes.values()]
            attr_strs = [s for s in attr_strs if s != '']
            outlines.extend(attr_strs)
            # Walk the spans backwards and insert at the front, so that
            # they end up before the event line in their original order.
            for span in reversed(self.spans):
                span_str = span.to_brat_str(output_references=True, seen=seen)
                if span_str != '':
                    outlines.insert(0, span_str)
        brat_str = '\n'.join(outlines)
        return brat_str

    def asdict(self):
        """
        Return the fields of this event as a plain dictionary. The spans
        are given as (type, id) pairs under 'ref_spans'.
        """
        return {'_id': self.id,
                '_type': self.type,
                'ref_spans': [(ref.type, ref.id) for ref in self.spans],
                '_source_file': self._source_file}


class BratAnnotations(object):
    """
    The main class for working with brat annotations.

    You can read annotations from a file.

    .. code-block:: python

        >>> import pybrat
        >>> anns = pybrat.BratAnnotations.from_file("path/to/file.ann")

    You can also create a set of annotations from Event instances.

    .. code-block:: python

        >>> import pybrat
        >>> event1 = pybrat.Event("E1", *e1spans)
        >>> event2 = pybrat.Event("E2", *e2spans)
        >>> anns = pybrat.BratAnnotations.from_events([event1, event2])
    """

    @classmethod
    def from_file(cls, fpath):
        """
        Read brat annotations from the specified file.

        :param str fpath: The path to the ann file.
        :returns: a new BratAnnotations instance.
        :raises ValueError: if a line does not start with 'T', 'E' or 'A'.
        """
        spans = []
        events = []
        attributes = []
        with open(fpath, 'r') as inF:
            for line in inF:
                line = line.strip()
                # Blank lines carry no annotation; indexing line[0] on
                # them would raise IndexError.
                if line == '':
                    continue
                ann_type = line[0]
                if ann_type == 'T':
                    data = parse_brat_span(line)
                    target = spans
                elif ann_type == 'E':
                    data = parse_brat_event(line)
                    target = events
                elif ann_type == 'A':
                    data = parse_brat_attribute(line)
                    target = attributes
                else:
                    raise ValueError(f"Unsupported ann_type '{ann_type}'.")
                data["_source_file"] = fpath
                target.append(data)
        annotations = cls(spans=spans, events=events, attributes=attributes,
                          _source_file=fpath)
        return annotations

    @classmethod
    def from_events(cls, events_iter):
        """
        Create a BratAnnotations instance from a collection of Events.
        Assumes that the Event instances in events_iter contain all Spans
        and Attributes.

        :param List[Event] events_iter: An iterable over Event instances.
        """
        annotations = cls(spans=[], events=[], attributes=[])
        annotations._events = list(events_iter)
        for event in annotations.events:
            annotations._attributes.extend(event.attributes.values())
            for span in event.spans:
                annotations._spans.append(span)
                annotations._attributes.extend(span.attributes.values())
        return annotations

    def __init__(self, spans=None, events=None, attributes=None,
                 _source_file=None):
        """
        :param list spans: raw span dictionaries (keyword arguments for Span).
        :param list events: raw event dictionaries with the keys '_id',
                            '_type', 'ref_spans' and, optionally,
                            '_source_file'.
        :param list attributes: raw attribute dictionaries (keyword
                                arguments for Attribute plus 'ref_id').
        :param str _source_file: (Optional) the .ann file these came from.
        """
        self._raw_spans = spans or []
        self._raw_events = events or []
        self._raw_attributes = attributes or []
        self._spans = []  # Will hold Span instances
        self._attributes = []  # Will hold Attribute instances
        self._events = []  # Will hold Event instances
        self._source_file = _source_file
        self._resolve()
        self._sorted_spans = None
        self._sorted_attributes = None
        self._sorted_events = None

    def __eq__(self, other):
        """
        Two BratAnnotations are equal when their sorted spans, attributes
        and events are pairwise equal.
        """
        if not isinstance(other, BratAnnotations):
            return False
        # Each collection is fetched lazily and in this order, so a cheap
        # mismatch in the spans returns before anything else is sorted.
        for field in ("spans", "attributes", "events"):
            mine = getattr(self, field)
            theirs = getattr(other, field)
            if len(mine) != len(theirs):
                return False
            if any(this != that for (this, that) in zip(mine, theirs)):
                return False
        return True

    def get_events_by_type(self, event_type):
        """Return the events whose type equals event_type."""
        return [e for e in self.events if e.type == event_type]

    def get_attributes_by_type(self, attr_type):
        """Return the attributes whose type equals attr_type."""
        return [a for a in self.attributes if a.type == attr_type]

    def get_spans_by_type(self, span_type):
        """Return the spans whose type equals span_type."""
        return [s for s in self.spans if s.type == span_type]

    @property
    def spans(self):
        """The spans, sorted by start index."""
        return self._sort_spans_by_index()

    @property
    def attributes(self):
        """
        The attributes which refer to a Span or Event, sorted by the
        start index of that reference.
        """
        return self._sort_attributes_by_span_index()

    @property
    def events(self):
        """The events, sorted by start index."""
        return self._sort_events_by_span_index()

    def __iter__(self):
        yield from self.get_highest_level_annotations()

    def _sort_spans_by_index(self):
        return sorted(self._spans, key=lambda s: s.start_index)

    def _sort_attributes_by_span_index(self):
        # An Attribute may refer to a Span or an Event, so we have to
        # check which is the case. Attributes with any other reference
        # (e.g., None) have no position and are left out.
        keyed_attributes = []
        for attr in self._attributes:
            # Use 'A' and 'B' so that spans come first in the sort order, given
            # the same start_index.
            if isinstance(attr.reference, Span):
                sort_key = (attr.reference.start_index, 'A')
            elif isinstance(attr.reference, Event):
                sort_key = (attr.reference.start_index, 'B')
            else:
                continue
            # Keep the key next to its attribute. Indexing back into
            # self._attributes by position goes wrong as soon as one
            # attribute has been skipped.
            keyed_attributes.append((sort_key, attr))
        keyed_attributes.sort(key=lambda pair: pair[0])
        return [attr for (_, attr) in keyed_attributes]

    def _sort_events_by_span_index(self):
        return sorted(self._events, key=lambda e: e.start_index)

    def _check_source_file(self, raw):
        """
        Compare the '_source_file' of a raw annotation dictionary, if it
        has one, with the source file of this instance. The first source
        file seen is adopted when this instance does not have one yet.

        :raises OSError: if the two source files differ.
        """
        if "_source_file" not in raw:
            return
        if self._source_file is None:
            self._source_file = raw["_source_file"]
        elif self._source_file != raw["_source_file"]:
            raise OSError(f"Found conflicting source files! {self._source_file} != {raw['_source_file']}")  # noqa

    def _resolve(self):
        """
        Given a set of raw spans, attributes, and events, e.g., as read
        from a .ann file, creates Span, Attribute, and Event instances and
        then links them as specified in the file.

        :raises OSError: if the raw annotations name different source files.
        :raises ValueError: if an event refers to an ID which is neither a
                            span nor a resolvable event.
        """
        span_lookup = {}
        event_lookup = {}
        attribute_lookup = defaultdict(list)

        for raw_span in self._raw_spans:
            self._check_source_file(raw_span)
            span = Span(**raw_span)
            span_lookup[raw_span["_id"]] = span
            self._spans.append(span)

        for raw_attr in self._raw_attributes:
            self._check_source_file(raw_attr)
            # Work on a copy so the caller's dictionary keeps its 'ref_id'.
            attr_kwargs = dict(raw_attr)
            ref_id = attr_kwargs.pop("ref_id")
            # Attributes of events get no reference yet; they are linked
            # below once the event itself has been created.
            ref = span_lookup.get(ref_id, None)
            attribute = Attribute(**attr_kwargs, reference=ref)
            attribute_lookup[ref_id].append(attribute)
            self._attributes.append(attribute)

        # Events can be nested, and sometimes can be out of order,
        # so we loop over the raw events until they're all accounted for.
        pending = list(self._raw_events)
        while len(pending) > 0:
            unresolved = []
            for raw_event in pending:
                if raw_event["_id"] in event_lookup:
                    continue
                self._check_source_file(raw_event)
                event_spans = []
                skip_this_event = False
                for (span_type, span_id) in raw_event["ref_spans"]:
                    if span_id in span_lookup:
                        event_spans.append(span_lookup[span_id])
                    elif span_id in event_lookup:
                        event_spans.append(event_lookup[span_id])
                    else:
                        # Might be an event that has not been built yet.
                        skip_this_event = True
                        break
                if skip_this_event is True:
                    unresolved.append(raw_event)
                    continue

                event = Event(raw_event["_id"], *event_spans,
                              attributes=None, _type=raw_event["_type"],
                              _source_file=raw_event.get("_source_file"))
                event_lookup[raw_event["_id"]] = event
                attrs = attribute_lookup[raw_event["_id"]]
                for attr in attrs:
                    attr.reference = event
                attrs_by_type = {attr.type: attr for attr in attrs}
                event.attributes = attrs_by_type
                self._events.append(event)
            # A full pass which resolved nothing can never make progress:
            # the remaining events refer to IDs that do not exist (or to
            # each other). Fail instead of looping forever.
            if len(unresolved) == len(pending):
                bad_ids = ', '.join(str(raw["_id"]) for raw in unresolved)
                raise ValueError(f"Could not resolve the references of events: {bad_ids}")  # noqa
            pending = unresolved

    def get_highest_level_annotations(self, type=None):
        """
        brat annotations can include only spans, spans + events,
        or spans + events + attributes. This method allows one to
        get the highest-level annotation available in this file.

        The first non-empty collection in this order is returned:
          Event
          Span
          Attribute

        :param str type: (Optional) return annotations with the specified type.
        :returns: a list of annotations, empty if there are none at all.
        """
        if len(self._events) > 0:
            if type is not None:
                return self.get_events_by_type(type)
            return self.events
        if len(self._spans) > 0:
            if type is not None:
                return self.get_spans_by_type(type)
            return self.spans
        if len(self._attributes) > 0:
            if type is not None:
                return self.get_attributes_by_type(type)
            return self.attributes
        return []

    def __str__(self):
        """
        Format all annotations as brat text: events first (together with
        their spans and attributes), then the remaining spans, then the
        remaining attributes.
        """
        seen_spans = set()
        seen_attrs = set()
        lines = []
        # IDs which have been output; shared with every to_brat_str call
        # so that nothing is written twice.
        seen = set()
        for event in self.events:
            line = event.to_brat_str(output_references=True, seen=seen)
            if line != '':
                lines.append(line)
            seen_spans.update(event.spans)
            seen_attrs.update(event.attributes.values())
        for span in self.spans:
            if span not in seen_spans:
                line = span.to_brat_str(output_references=True, seen=seen)
                if line != '':
                    lines.append(line)
                seen_attrs.update(span.attributes.values())
        for attr in self.attributes:
            if attr not in seen_attrs:
                line = attr.to_brat_str(output_references=False, seen=seen)
                if line != '':
                    lines.append(line)
        return '\n'.join(lines).strip()

    def add_annotation(self, annotation: Annotation):
        """
        Add a Span, Event or Attribute, together with the annotations it
        refers to. Nothing happens if an equal annotation is already
        present. If the id is already taken, the annotation is given the
        next free id, i.e., its `_id` is modified in place.

        :param Annotation annotation: the annotation to add.
        :raises ValueError: if annotation is not a Span, Event or Attribute.
        """
        ann_cls = annotation.__class__.__name__
        references = []

        if ann_cls == "Span":
            annlist = self._spans
            prefix = 'T'
        elif ann_cls == "Event":
            annlist = self._events
            prefix = 'E'
            references = annotation.spans
        elif ann_cls == "Attribute":
            annlist = self._attributes
            prefix = 'A'
            references = [annotation.reference]
        else:
            raise ValueError(f"Unsupported Annotation type '{ann_cls}'")
        if annotation in annlist:
            return
        seen_ids = {ann.id for ann in annlist}
        if annotation.id in seen_ids:
            max_id = max(int(annid.strip(prefix)) for annid in seen_ids)
            annotation._id = f"{prefix}{max_id + 1}"
        annlist.append(annotation)
        for ref in references:
            # An Attribute may have no reference; there is nothing to add.
            if ref is None:
                continue
            self.add_annotation(ref)

    def save_brat(self, outdir, filename=None):
        """
        Save these brat annotations to a brat-formatted file.

        :param str outdir: The directory in which to save the file.
        :param str filename: (Optional) The filename to use. If not specified,
                             attempts to use the Annotation._source_file.
        :raises ValueError: if no filename is given and there is no
                            source file to take the name from.
        """
        if filename is None and self._source_file is None:
            raise ValueError("No filename specified.")
        if filename is not None:
            outfile = os.path.join(outdir, filename)
        else:
            bn = os.path.basename(self._source_file)
            outfile = os.path.join(outdir, bn)
        brat_str = str(self)
        # NOTE(review): the file is opened in append mode, so saving to an
        # existing file adds to it rather than replacing it. Confirm that
        # this is intended.
        with open(outfile, 'a') as outF:
            outF.write(brat_str + '\n')


[docs]class BratText(object):
    """
    A simple class for organizing the text that corresponds to a
    file of brat annotations.

    Specify plain text, split sentences, or both.

    .. code-block:: python

        >>> bt = BratText(text=plain_text, sentences=list_of_sents)
        >>> bt.text(start_char=0, end_char=12)  # Plain text at character indices 0 through 11
        >>> bt.tokens(start_char=0, end_char=12)  # Tokens spanning character indices 0 through 11
        >>> bt.sentences(start_char=0, end_char=12)  # Sentences spanning character indices 0 - 11

    `sentences` can also be a json lines file with the following format:

    .. code-block:: bash

        {"sent_index": int  # the number of this sentence in the document
         "start_char": int  # the character offset of the start of the sentence
         "end_char": int    # the character offset of the end of the sentence
         "_text":           # the sentence text
        }

    You can also access the text using Annotation instances

    .. code-block:: python

        >>> anns = BratAnnotations.from_file("path/to/file1.ann")
        >>> bt = BratText.from_files(text="path/to/file1.txt",
        ...                          sentences="path/to/file1.jsonl")
        >>> # get the text of the first span
        >>> bt.text(annotations=[anns.spans[0]])
        >>> # tokens from the first three spans
        >>> bt.tokens(annotations=anns.spans[0:3])
        >>> # Sentences containing all events
        >>> bt.sentences(annotations=anns.events[:])
    """
[docs]    @classmethod
    def from_files(cls, text=None, sentences=None, tokenizer=None):
        if text is None and sentences is None:
            raise ValueError("Must specify at least one of text, sentences")
        if text is not None:
            text = open(text, 'r').read()
        if sentences is not None:
            try:
                sentences = [json.loads(line) for line in open(sentences, 'r')]
            except json.JSONDecodeError:
                sentences = open(sentences, 'r').readlines()
        return cls(text=text, sentences=sentences, tokenizer=tokenizer)

    def __init__(self, text=None, sentences=None, tokenizer=None):
        """
        :param str text: (Optional) the plain text. If omitted, the text is
                         obtained from the sentences.
        :param sentences: (Optional) the split sentences.
        :param tokenizer: (Optional) handed to `_get_tokenizer`.
        :raises ValueError: if neither text nor sentences is given.
        """
        has_sentences = sentences is not None
        if text is None and not has_sentences:
            raise ValueError("Must supply at least one of text or sentences")
        self.is_split_into_sentences = has_sentences
        # The sentence lookup has to exist before the text can be
        # obtained from the sentences below.
        if has_sentences:
            self._sentences_lookup = self._split_sentences(sentences)
        self._text = self._get_text_from_sentences() if text is None else text
        self.tokenizer = self._get_tokenizer(tokenizer)
        self._tokens = None
        self._tokens_lookup = self._tokenize(self._text)

    def __str__(self):
        """The full plain text held by this instance."""
        full_text = self._text
        return full_text

[docs]    def text(self, annotations: List[Annotation] = [],
             start_char: int = None, end_char: int = None):
        assert isinstance(start_char, (type(None), int))
        assert isinstance(end_char, (type(None), int))
        if not isinstance(annotations, list):
            annotations = [annotations]
        assert all([isinstance(ann, Annotation) for ann in annotations])
        if len(annotations) > 0:
            if start_char is not None or end_char is not None:
                warnings.warn("Ignoring {start,end}_char since Annotation was provided.")  # noqa
            start_char = min([ann.start_index for ann in annotations])
            end_char = max([ann.end_index for ann in annotations])
        if end_char is None:
            if start_char is None:
                end_char = len(self._text)
            else:
                end_char = start_char + 1
        if start_char is None:
            start_char = 0
        return self._text[start_char:end_char]

[docs]    def tokens(self, annotations: List[Annotation] = [],
               start_char: int = None, end_char: int = None):
        assert isinstance(start_char, (type(None), int))
        assert isinstance(end_char, (type(None), int))
        if not isinstance(annotations, list):
            annotations = [annotations]
        assert all([isinstance(ann, Annotation) for ann in annotations])
        if len(annotations) > 0:
            if start_char is not None or end_char is not None:
                warnings.warn("Ignoring {start,end}_char since Annotation was provided.")  # noqa
            char_idxs = sum([ann.indices for ann in annotations],
                            CharacterIndex([]))

        else:
            if end_char is None:
                if start_char is None:
                    end_char = len(self._text)
                else:
                    end_char = start_char + 1
            if start_char is None:
                start_char = 0
            char_idxs = range(start_char, end_char)

        tokens = []
        for char_i in char_idxs:
            try:
                t = self._tokens_lookup[char_i]
            except KeyError:
                continue
            if len(tokens) == 0 or t != tokens[-1]:
                tokens.append(t)
        return tokens

[docs]    def sentences(self, annotations: List[Annotation] = [],
                  start_char: int = None, end_char: int = None):
        if self.is_split_into_sentences is False:
            raise ValueError("Text is not split into sentences.")
        assert isinstance(start_char, (type(None), int))
        assert isinstance(end_char, (type(None), int))
        if not isinstance(annotations, list):
            annotations = [annotations]
        assert all([isinstance(ann, Annotation) for ann in annotations])
        if len(annotations) > 0:
            if start_char is not None or end_char is not None:
                warnings.warn("Ignoring {start,end}_char since Annotation was provided.")  # noqa
            char_idxs = sum([ann.indices for ann in annotations],
                            CharacterIndex([]))
        else:
            if end_char is None:
                if start_char is None:
                    end_char = max(list(self._sentences_lookup.keys()))
                else:
                    end_char = start_char + 1
            if start_char is None:
                start_char = min(list(self._sentences_lookup.keys()))
            char_idxs = range(start_char, end_char)

        sents = []
        for char_i in char_idxs:
            try:
                s = self._sentences_lookup[char_i]
            except KeyError:
                continue
            if s not in sents:
                sents.append(s)
        return sents

[docs]    def save(self, outdir, filename=None):
        """
        Save this BratText instance to a plain text file.

        :param str outdir: The directory in which to save the file.
        :param str filename: (Optional) The filename to use. If not specified,
                             attempts to use the Annotation._source_file.
        """
        if filename is None and self._source_file is None:
            raise ValueError("No filename specified.")
        if filename is not None:
            outfile = os.path.join(outdir, filename)
        else:
            bn = os.path.basename(self._source_file)
            outfile = os.path.join(outdir, bn)
        brat_str = str(self)
        with open(outfile, 'a') as outF:
            outF.write(brat_str + '\n')

    def _get_text_from_sentences(self):
        text = ''
        for sent in self.sentences():
            text += ' ' * (sent["start_char"] - len(text))
            text += sent["_text"]
        return text

    def _split_sentences(self, sentences):
        assert isinstance(sentences, list)
        is_str = all([isinstance(sent, str) for sent in sentences])
        is_dict = all([isinstance(sent, dict) for sent in sentences])
        assert is_str or is_dict

        sentence_lookup = {}
        if is_str:
            # assume sentences passed in order with no character offsets
            current_start = 0
            for (i, sent) in enumerate(sentences):
                current_end = current_start + len(sent)
                sent_data = {"sent_index": i,
                             "start_char": current_start,
                             "end_char": current_end,
                             "_text": sent}
                for char_idx in range(current_start, current_end):
                    sentence_lookup[char_idx] = sent_data
                current_start = current_end
        elif is_dict:
            seen_indices = set()
            for sent in sentences:
                assert "sent_index" in sent
                assert "start_char" in sent
                assert "end_char" in sent
                assert "_text" in sent
                assert sent["sent_index"] not in seen_indices, "Duplicate sent_index!"  # noqa
                seen_indices.add(sent["sent_index"])
                for char_idx in range(sent["start_char"], sent["end_char"]):
                    sentence_lookup[char_idx] = sent
        return sentence_lookup

    def _get_tokenizer(self, tokenizer):
        if tokenizer is None:
            tokenizer = RegexTokenizer()
        return tokenizer

    def _tokenize(self, text):
        tokens, char_ranges = self.tokenizer(text)
        token_lookup = {}
        for (tok, crange) in zip(tokens, char_ranges):
            for ci in range(*crange):
                token_lookup[ci] = tok
        return token_lookup


class RegexTokenizer(object):
    """
    A very simple tokenizer that splits on whitespace by default.

    The split pattern is tested against one character at a time, so it
    should describe a single delimiter character (e.g. ``r'\\s'`` or
    ``r'[,;\\s]'``). Runs of delimiters never produce empty tokens.

    Calling the tokenizer returns the tokens together with their character
    ranges as ``(start, end)`` tuples where ``end`` is exclusive, i.e.
    ``text[start:end] == token`` and ``range(start, end)`` covers every
    character of the token.

    .. code-block:: python

        >>> import pybrat
        >>> tokenizer = pybrat.RegexTokenizer()
        >>> text = "The cat in the hat"
        >>> tokens, token_char_ranges = tokenizer(text)
        >>> tokens
        ['The', 'cat', 'in', 'the', 'hat']
        >>> token_char_ranges
        [(0, 3), (4, 7), (8, 10), (11, 14), (15, 18)]
    """
    def __init__(self, split_pattern=r'\s'):
        self.split_pattern = re.compile(split_pattern)

    def __call__(self, text: str):
        """
        Tokenize ``text``.

        :param str text: The text to split.
        :returns: A ``(tokens, token_ranges)`` tuple of two parallel lists.
        """
        tokens = []
        token_ranges = []
        start = None  # index where the current token began, if inside one
        for (i, char) in enumerate(text):
            if self.split_pattern.match(char):
                if start is not None:
                    # A delimiter closes the token; `i` is its exclusive end.
                    tokens.append(text[start:i])
                    token_ranges.append((start, i))
                    start = None
            elif start is None:
                start = i
        if start is not None:
            # The text ended while still inside a token.
            tokens.append(text[start:])
            token_ranges.append((start, len(text)))
        return tokens, token_ranges


def parse_brat_span(line):
    """
    Parse one text-bound (span) annotation line of a brat .ann file.

    The line is expected to look like ``ID<ws>LABEL<ws>OFFSETS[\\tTEXT]``
    where OFFSETS is one or more ``start end`` pairs separated by ``;``.

    :param str line: The raw annotation line.
    :returns: A dict with keys ``_id``, ``_type``, ``indices`` (a
              CharacterIndex built from the integer offset pairs) and
              ``text`` (empty string when the line carries no text).
    """
    # Sometimes things like '&quot;' appear
    line = html.unescape(line)
    uid, label, other = line.split(maxsplit=2)
    # `other` is "start1 end1;start2 end2" optionally followed by a tab and
    # the covered text. Cut at the first tab only, so a tab inside the text
    # itself stays part of the text instead of breaking the parse.
    spans, _, text = other.partition('\t')
    offset_pairs = []
    for fragment in spans.split(';'):
        s, e = fragment.split()
        offset_pairs.append((int(s), int(e)))
    indices = CharacterIndex(offset_pairs)

    return {"_id": uid,
            "_type": label,
            "indices": indices,
            "text": text}


def parse_brat_event(line):
    """
    Parse one event annotation line of a brat .ann file.

    The line is expected to look like ``ID\\tLABEL:REF LABEL:REF ...``.
    The label of the first ``LABEL:REF`` pair is used as the event type.
    Tab-separated fields beyond the second are dropped with a warning.

    :param str line: The raw annotation line.
    :returns: A dict with keys ``_id``, ``_type`` and ``ref_spans`` (a list
              of ``(label, ref)`` tuples in the order they appear).
    """
    fields = line.split('\t')
    if len(fields) > 2:
        # Sometimes we get attributes appended to the end
        # E0\tSubject:T0 Object:T1\tSource:T001
        # Ignore with warning for now
        warnings.warn(f"Ignoring extra data {fields[2:]} for event {fields[0]}")  # noqa
        fields = fields[:2]
    uid = fields[0]
    pair_strings = fields[1].split()
    # There should be at least one span
    assert len(pair_strings) >= 1
    ref_spans = []
    for pair in pair_strings:
        label, ref = pair.split(':')
        ref_spans.append((label, ref))
    # The first pair names the event itself.
    event_label = ref_spans[0][0] if ref_spans else None
    return {"_id": uid,
            "_type": event_label,
            "ref_spans": ref_spans}


def parse_brat_attribute(line):
    """
    Parse one attribute annotation line of a brat .ann file.

    The line is split on whitespace into ``ID LABEL REF VALUE``. A
    ``Negation`` attribute given without a value gets the value
    ``"Negated"``.

    :param str line: The raw annotation line.
    :returns: A dict with keys ``_id``, ``_type``, ``value`` and ``ref_id``.
    """
    parts = line.split()
    if parts[1] == "Negation" and len(parts) == 3:
        # Value-less Negation: supply the implied value.
        parts.append("Negated")
    assert len(parts) == 4
    uid, label, ref, value = parts
    return {"_id": uid,
            "_type": label,
            "value": value,
            "ref_id": ref}
Source code for pybrat

pybrat

Navigation

Related Topics