Skip to content
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
27 changes: 16 additions & 11 deletions cassis/cas.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,10 +20,12 @@
TYPE_NAME_FS_LIST,
TYPE_NAME_SOFA,
FeatureStructure,
Annotation,
Type,
TypeCheckError,
TypeSystem,
TypeSystemMode,
is_annotation,
)

_validator_optional_string = validators.optional(validators.instance_of(str))
Expand Down Expand Up @@ -172,13 +174,14 @@ def type_index(self) -> Dict[str, SortedKeyList]:
return self._indices

def add_annotation_to_index(self, annotation: FeatureStructure):
    """Register `annotation` in this view's index for its type.

    Args:
        annotation: The feature structure to index; it is filed under
            the name of its own type (`annotation.type.name`).
    """
    type_name = annotation.type.name
    index_for_type = self._indices[type_name]
    index_for_type.add(annotation)

def get_all_annotations(self) -> List[FeatureStructure]:
"""Gets all the annotations in this view.
"""Gets all the FeatureStructure in this view.

Returns:
A list of all annotations in this view.
A list of all FeatureStructure in this view.

"""
result = []
Expand Down Expand Up @@ -335,6 +338,8 @@ def add(self, annotation: FeatureStructure, keep_id: Optional[bool] = True):
if hasattr(annotation, "sofa"):
annotation.sofa = self.get_sofa()

# Add to the index. The view index accepts any FeatureStructure;
# `_sort_func` will duck-type annotation-like objects when sorting.
self._current_view.add_annotation_to_index(annotation)
Comment thread
reckart marked this conversation as resolved.
Outdated

@deprecation.deprecated(details="Use add()")
Expand Down Expand Up @@ -492,7 +497,7 @@ def remove_annotations_in_range(self, begin: int, end: int, type_: Optional[Unio
raise ValueError(f"Invalid indices for begin {begin} and end {end}")

@deprecation.deprecated(details="Use annotation.get_covered_text()")
def get_covered_text(self, annotation: FeatureStructure) -> str:
def get_covered_text(self, annotation: Annotation) -> str:
"""Gets the text that is covered by `annotation`.

Args:
Expand All @@ -518,7 +523,7 @@ def select(self, type_: Union[Type, str]) -> List[FeatureStructure]:
t = type_ if isinstance(type_, Type) else self.typesystem.get_type(type_)
return self._get_feature_structures(t)

def select_covered(self, type_: Union[Type, str], covering_annotation: FeatureStructure) -> List[FeatureStructure]:
def select_covered(self, type_: Union[Type, str], covering_annotation: Annotation) -> List[Annotation]:
"""Returns a list of covered annotations.

Return all annotations that are covered
Expand All @@ -544,7 +549,7 @@ def select_covered(self, type_: Union[Type, str], covering_annotation: FeatureSt
result.append(annotation)
return result

def select_covering(self, type_: Union[Type, str], covered_annotation: FeatureStructure) -> List[FeatureStructure]:
def select_covering(self, type_: Union[Type, str], covered_annotation: Annotation) -> List[FeatureStructure]:
"""Returns a list of annotations that cover the given annotation.

Return all annotations that are covering. This can potentially be slow.
Expand All @@ -570,7 +575,7 @@ def select_covering(self, type_: Union[Type, str], covered_annotation: FeatureSt
if c_begin >= annotation.begin and c_end <= annotation.end:
yield annotation

def select_all(self) -> List[FeatureStructure]:
def select_all(self) -> List[Annotation]:
Comment thread
reckart marked this conversation as resolved.
Outdated
"""Finds all feature structures in this Cas

Returns:
Expand Down Expand Up @@ -939,8 +944,8 @@ def _copy(self) -> "Cas":


def _sort_func(a: FeatureStructure) -> Tuple[int, int, int]:
    """Compute the sort key of a feature structure.

    Annotation-like feature structures are ordered by `(begin, end)`. All
    other feature structures use `sys.maxsize` for both offsets so that they
    sort after the annotations.

    The annotation check is delegated to `is_annotation` instead of looking
    for "begin"/"end" in `a.__slots__`: `__slots__` only lists the slots a
    class declares itself, so offsets inherited from a base class would not
    be found there.

    Args:
        a: The feature structure to compute the key for.

    Returns:
        A tuple `(begin, end, tie_breaker)`. The tie breaker is the `xmiID`
        of `a` if it has one that is not None, otherwise `id(a)`.
    """
    # NOTE(review): the xmiID is part of the key; if this key is used by a
    # sorted container, the xmiID presumably must not change while the
    # feature structure is indexed - confirm against the callers.
    tie_breaker = getattr(a, "xmiID", None)
    if tie_breaker is None:
        tie_breaker = id(a)

    if is_annotation(a):
        return a.begin, a.end, tie_breaker

    # Non-annotation feature structures are sorted after annotations using large sentinels
    return sys.maxsize, sys.maxsize, tie_breaker
54 changes: 50 additions & 4 deletions cassis/typesystem.py
Original file line number Diff line number Diff line change
Expand Up @@ -500,6 +500,23 @@ def __repr__(self):
return str(self)


# NOTE(review): eq=True combined with hash=False - confirm that instances of
# this class (and of classes derived from it) are still hashable as intended.
@attr.s(slots=True, hash=False, eq=True, order=True, repr=False)
class Annotation(FeatureStructure):
    """Nominal base class for feature structures that carry offsets.

    Classes generated for `uima.tcas.Annotation` and its subtypes are meant
    to derive from this class, which gives static type checkers a concrete
    declaration of the `begin` and `end` attributes.
    """

    # Start offset of the annotation; 0 if no value is supplied.
    begin: int = attr.ib(default=0)
    # End offset of the annotation; 0 if no value is supplied.
    end: int = attr.ib(default=0)


def is_annotation(fs: FeatureStructure) -> bool:
    """Duck-typed check whether `fs` looks like an annotation.

    Args:
        fs: The feature structure to inspect.

    Returns:
        True if `fs` has both a `begin` and an `end` attribute and both
        hold `int` values, False otherwise.
    """
    if not hasattr(fs, "begin") or not isinstance(fs.begin, int):
        return False
    if not hasattr(fs, "end"):
        return False
    return isinstance(fs.end, int)


@attr.s(slots=True, eq=False, order=False, repr=False)
class Feature:
"""A feature defines one attribute of a feature structure"""
Expand Down Expand Up @@ -572,15 +589,44 @@ class Type:
def __attrs_post_init__(self):
    """Build the constructor that can create feature structures of this type.

    The attrs fields and the base class are determined right away, but the
    class itself is only generated when `self._constructor_fn` is called.
    """
    name = _string_to_valid_classname(self.name)

    # Determine whether this type is (transitively) a subtype of uima.tcas.Annotation
    def _is_annotation_type(t: "Type") -> bool:
        cur = t
        while cur is not None:
            if cur.name == TYPE_NAME_ANNOTATION:
                return True
            cur = cur.supertype
        return False

    # The answer does not change per feature, so walk the supertype chain
    # only once and reuse the result for the fields and the base class.
    is_annotation_type = _is_annotation_type(self)

    # When inheriting from our concrete Annotation base, do not redeclare
    # the 'begin' and 'end' features as fields; they are already present.
    provided_by_base = frozenset({"begin", "end"}) if is_annotation_type else frozenset()
    fields = {
        feature.name: attr.ib(default=None, repr=(feature.name != "sofa"))
        for feature in self.all_features
        if feature.name not in provided_by_base
    }
    fields["type"] = attr.ib(default=self)

    bases = (Annotation,) if is_annotation_type else (FeatureStructure,)

    # The class is created lazily: when creating large type systems, almost
    # no types are used, so creating them on the fly is on average better.
    def _make_fs_class():
        cls = attr.make_class(name, fields, bases=bases, slots=True, eq=False, order=False)
        # Ensure generated FS classes are hashable. When a class defines an
        # __eq__ (inherited or generated) but no __hash__, Python makes
        # instances unhashable. We want FeatureStructure-based instances to
        # be usable as dict/set keys, so assign the base
        # FeatureStructure.__hash__ implementation to the generated class
        # if it doesn't already provide one.
        if getattr(cls, "__hash__", None) is None:
            cls.__hash__ = FeatureStructure.__hash__
        return cls

    self._constructor_fn = _make_fs_class

def __call__(self, **kwargs) -> FeatureStructure:
"""Creates a feature structure of this type
Expand Down
19 changes: 9 additions & 10 deletions cassis/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@
TYPE_NAME_STRING_ARRAY,
FeatureStructure,
Type,
is_annotation,
is_array,
is_list,
)
Expand Down Expand Up @@ -205,7 +206,7 @@ def _render_feature_structure(
if indexed_column:
row_data.append(_bool_to_java_string(id(fs) in indexed_feature_structure_ids))

if max_covered_text > 0 and _is_annotation_fs(fs):
if max_covered_text > 0 and is_annotation(fs):
covered_text_value = _abbreviate_middle(fs.get_covered_text(), "...", max_covered_text)
row_data.append(_escape(_render_string_value(covered_text_value, treat_empty_strings_as_null, null_value)))

Expand Down Expand Up @@ -354,7 +355,7 @@ def _generate_anchor(
) -> str:
anchor = fs.type.name.rsplit(".", 2)[-1] # Get the short type name (no package)

if include_offsets and _is_annotation_fs(fs):
if include_offsets and is_annotation(fs):
anchor += f"[{fs.begin}-{fs.end}]"

if add_index_mark:
Expand All @@ -381,10 +382,6 @@ def _is_multi_valued_feature_structure(fs: Any) -> bool:
return isinstance(fs, FeatureStructure) and (is_array(fs.type) or is_list(fs.type))


def _is_annotation_fs(fs: FeatureStructure) -> bool:
    """Duck-typed check whether `fs` carries integer `begin`/`end` offsets.

    Args:
        fs: The feature structure to inspect.

    Returns:
        True if both `begin` and `end` exist on `fs` and hold `int` values.
    """
    for offset_name in ("begin", "end"):
        if not hasattr(fs, offset_name):
            return False
        if not isinstance(getattr(fs, offset_name), int):
            return False
    return True


def _compare_fs(
type_: Type,
a: FeatureStructure,
Expand All @@ -396,8 +393,8 @@ def _compare_fs(
return 0

# duck-typing check if something is an annotation - if yes, try sorting by offsets
Comment thread
reckart marked this conversation as resolved.
Outdated
fs_a_is_annotation = _is_annotation_fs(a)
fs_b_is_annotation = _is_annotation_fs(b)
fs_a_is_annotation = is_annotation(a)
fs_b_is_annotation = is_annotation(b)
if fs_a_is_annotation != fs_b_is_annotation:
return -1
Comment thread
reckart marked this conversation as resolved.
Outdated
if fs_a_is_annotation and fs_b_is_annotation:
Expand Down Expand Up @@ -536,7 +533,9 @@ def _escape(value: str) -> str:
return value.translate(_ESCAPE_TRANSLATION)


def _abbreviate_middle(value: str, middle: str, max_length: int) -> str:
def _abbreviate_middle(value: Optional[str], middle: str, max_length: int) -> Optional[str]:
if value is None:
return None
if len(value) <= max_length:
return value

Expand Down Expand Up @@ -584,7 +583,7 @@ def _render_multi_valued_feature_structure(
if values is None:
return null_value

if sort_annotations_in_multi_valued_features and all(_is_annotation_fs(value) for value in values):
if sort_annotations_in_multi_valued_features and all(is_annotation(value) for value in values):
values = sorted(values, key=lambda value: (value.begin, -value.end, value.type.name))

return _render_sequence(
Expand Down
18 changes: 12 additions & 6 deletions cassis/xmi.py
Original file line number Diff line number Diff line change
Expand Up @@ -619,13 +619,19 @@ def _serialize_feature_structure(self, cas: Cas, root: etree.Element, fs: Featur
continue

# Map back from offsets in Unicode codepoints to UIMA UTF-16 based offsets
if (
ts.is_instance_of(fs.type.name, TYPE_NAME_ANNOTATION)
and feature_name == FEATURE_BASE_NAME_BEGIN
or feature_name == FEATURE_BASE_NAME_END
# Ensure we only convert begin/end for annotation instances. Parentheses are
# required because `and` has higher precedence than `or` and we must not
# attempt conversion for the END feature on non-annotations.
if ts.is_instance_of(fs.type.name, TYPE_NAME_ANNOTATION) and (
feature_name == FEATURE_BASE_NAME_BEGIN or feature_name == FEATURE_BASE_NAME_END
):
sofa: Sofa = fs.sofa
value = sofa._offset_converter.python_to_external(value)
# Be defensive: only perform offset conversion if the sofa and its
# offset converter have been initialized. In some workflows (e.g. a
# freshly constructed CAS without sofa strings) the converter may
# not exist yet and conversion is not possible.
sofa = getattr(fs, "sofa", None)
if sofa is not None and getattr(sofa, "_offset_converter", None) is not None:
value = sofa._offset_converter.python_to_external(value)

if ts.is_instance_of(feature.rangeType, TYPE_NAME_STRING_ARRAY) and not feature.multipleReferencesAllowed:
if value.elements is not None: # Compare to none as not to skip if elements is empty!
Expand Down
33 changes: 33 additions & 0 deletions tests/test_cas.py
Original file line number Diff line number Diff line change
Expand Up @@ -600,6 +600,25 @@ def test_covered_text_on_non_annotation():
top.get_covered_text()


def test_add_non_annotation_and_select():
    """A feature structure without offsets can be added and selected again."""
    type_name = "test.NonAnnotation"
    cas = Cas()

    # The new type gets no explicit supertype and no features of its own
    non_annotation_type = cas.typesystem.create_type(type_name)

    fs = non_annotation_type()
    cas.add(fs)

    # Retrievable via its type name ...
    assert list(cas.select(type_name)) == [fs]

    # ... and also part of the complete selection
    assert fs in cas.select_all()


def test_covered_text_on_annotation_without_sofa():
cas = Cas()
Annotation = cas.typesystem.get_type(TYPE_NAME_ANNOTATION)
Expand All @@ -609,6 +628,20 @@ def test_covered_text_on_annotation_without_sofa():
ann.get_covered_text()


def test_runtime_generated_annotation_is_detected_and_shown_in_anchor():
    """Offsets of a runtime-created annotation subtype show up in its anchor."""
    typesystem = TypeSystem()
    # Subtype of uima.tcas.Annotation that is created at runtime
    my_annotation_type = typesystem.create_type("my.pkg.MyAnnotation", supertypeName="uima.tcas.Annotation")
    cas = Cas(typesystem)

    # begin/end must be accepted by the constructor of the generated class
    annotation = my_annotation_type(begin=5, end=10)
    cas.add(annotation)

    rendered = cas_to_comparable_text(cas)
    assert "MyAnnotation[5-10]" in rendered


def test_remove_annotations_in_range(small_typesystem_xml, small_xmi):
typesystem = load_typesystem(small_typesystem_xml)
cas = load_cas_from_xmi(small_xmi, typesystem)
Expand Down
6 changes: 3 additions & 3 deletions tests/test_files/xmi/cas_with_collections.xmi
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@
<elements>C</elements>
</cas:StringArray>

<cassis:Group xmi:id="5" sofa="1" integers="1 2 3" shorts="1 2 3" longs="1 2 3" floats="1.0 2.0 3.0" doubles="1.0 2.0 3.0" bytes="42DB3064" booleans="true false" fses="0 6 7">
<cassis:Group xmi:id="5" sofa="1" begin="0" end="0" integers="1 2 3" shorts="1 2 3" longs="1 2 3" floats="1.0 2.0 3.0" doubles="1.0 2.0 3.0" bytes="42DB3064" booleans="true false" fses="0 6 7">
<strings>A</strings>
<strings>B</strings>
<strings>C</strings>
Expand Down Expand Up @@ -48,9 +48,9 @@
<cas:LongArray xmi:id="17" elements="2516571677013944794"/>
<cas:DoubleArray xmi:id="18" elements="0.4362829094329638 0.6487936445670887 0.6959691863162578"/>

<cassis:Group xmi:id="19" sofa="1" integers="" shorts="" longs="" floats="" doubles="" bytes="" booleans="" fses=""/>
<cassis:Group xmi:id="19" sofa="1" begin="0" end="0" integers="" shorts="" longs="" floats="" doubles="" bytes="" booleans="" fses=""/>

<cassis:Group xmi:id="20" sofa="1"/>
<cassis:Group xmi:id="20" sofa="1" begin="0" end="0"/>

<cas:FSArray xmi:id="21" elements="" />

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@

<tcas:DocumentAnnotation xmi:id="1" sofa="1" begin="0" end="47" language="x-unspecified"/>

<test:type xmi:id="2" sofa="1" target="3"/>
<test:type xmi:id="2" sofa="1" target="3" begin="0" end="0"/>

<cas:StringArray xmi:id="3">
<elements>LNC</elements>
Expand Down
2 changes: 1 addition & 1 deletion tests/test_files/xmi/cas_with_reserved_names.xmi
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
xmlns:test="http:///test.ecore" xmi:version="2.0">
<cas:NULL xmi:id="0"/>
<tcas:DocumentAnnotation xmi:id="2" sofa="1" begin="0" end="47" language="x-unspecified"/>
<test:type xmi:id="3" type="2" self="2" sofa="1"/>
<test:type xmi:id="3" type="2" self="2" sofa="1" begin="0" end="0"/>
<cas:Sofa xmi:id="1" sofaNum="1" sofaID="_InitialView" mimeType="text/plain"
sofaString="Joe waited for the train . The train was late ."/>
<cas:View sofa="1" members="2 3"/>
Expand Down
Loading