From fd013423e5e917eceb485c1643b98c1c86456629 Mon Sep 17 00:00:00 2001 From: Pierre Camilleri Date: Wed, 16 Jul 2025 13:55:11 +0200 Subject: [PATCH 1/5] BooleanField independant of Metadata --- frictionless/detector/detector.py | 7 +- frictionless/fields/boolean.py | 30 +- frictionless/schema/field.py | 50 +++- frictionless/schema/field_constraints.py | 33 +++ frictionless/schema/field_descriptor.py | 363 +++++++++++++++++++++++ 5 files changed, 445 insertions(+), 38 deletions(-) create mode 100644 frictionless/schema/field_constraints.py create mode 100644 frictionless/schema/field_descriptor.py diff --git a/frictionless/detector/detector.py b/frictionless/detector/detector.py index fc3e47e0a1..6ff0896527 100644 --- a/frictionless/detector/detector.py +++ b/frictionless/detector/detector.py @@ -329,7 +329,7 @@ def detect_schema( # Handle name/empty for index, name in enumerate(names): - names[index] = name or f"field{index+1}" + names[index] = name or f"field{index + 1}" # Deduplicate names if len(names) != len(set(names)): @@ -360,10 +360,11 @@ def detect_schema( field.float_number = True # type: ignore elif field.type == "boolean": if self.field_true_values != settings.DEFAULT_TRUE_VALUES: - field.true_values = self.field_true_values # type: ignore + field._descriptor.true_values = self.field_true_values # type: ignore if self.field_false_values != settings.DEFAULT_FALSE_VALUES: - field.false_values = self.field_false_values # type: ignore + field._descriptor.false_values = self.field_false_values # type: ignore runner_fields.append(field) + for index, name in enumerate(names): runners.append([]) for field in runner_fields: diff --git a/frictionless/fields/boolean.py b/frictionless/fields/boolean.py index 6d78984fe1..518d6c9c44 100644 --- a/frictionless/fields/boolean.py +++ b/frictionless/fields/boolean.py @@ -1,6 +1,6 @@ from __future__ import annotations -from typing import Any, Dict, List +from typing import List import attrs @@ -29,34 +29,6 @@ class 
BooleanField(Field): true values are ["false", "False", "FALSE", "0"]. """ - # Read - - def create_value_reader(self): - # Create mapping - mapping: Dict[str, bool] = {} - for value in self.true_values: - mapping[value] = True - for value in self.false_values: - mapping[value] = False - - # Create reader - def value_reader(cell: Any): - if cell is True or cell is False: - return cell - if isinstance(cell, str): - return mapping.get(cell) - - return value_reader - - # Write - - def create_value_writer(self): - # Create writer - def value_writer(cell: Any): - return self.true_values[0] if cell else self.false_values[0] - - return value_writer - # Metadata metadata_profile_patch = { diff --git a/frictionless/schema/field.py b/frictionless/schema/field.py index 5b9f8c3eb5..f5499cfb14 100644 --- a/frictionless/schema/field.py +++ b/frictionless/schema/field.py @@ -1,16 +1,19 @@ from __future__ import annotations +import copy import decimal import re from functools import partial from typing import TYPE_CHECKING, Any, Callable, ClassVar, Dict, List, Optional, Pattern import attrs +import pydantic from .. import errors, settings from ..exception import FrictionlessException from ..metadata import Metadata from ..system import system +from .field_descriptor import BooleanFieldDescriptor, FieldDescriptor if TYPE_CHECKING: from ..types import IDescriptor @@ -22,6 +25,8 @@ class Field(Metadata): """Field representation""" + _descriptor: Optional[FieldDescriptor] = None + name: str """ A short url-usable (and preferably human-readable) name. @@ -50,9 +55,7 @@ class Field(Metadata): For example: "default","array" etc. """ - missing_values: List[str] = attrs.field( - factory=settings.DEFAULT_MISSING_VALUES.copy - ) + missing_values: List[str] = attrs.field(factory=settings.DEFAULT_MISSING_VALUES.copy) """ List of string values to be set as missing values in the field. If any of string in missing values is found in the field value then it is set as None. 
@@ -154,6 +157,8 @@ def cell_reader(cell: Any): def create_value_reader(self) -> types.IValueReader: # Create reader def value_reader(cell: Any): + if self._descriptor and isinstance(self._descriptor, BooleanFieldDescriptor): + return self._descriptor.read_value(cell) return cell return value_reader @@ -192,6 +197,8 @@ def cell_writer(cell: Any, *, ignore_missing: bool = False): def create_value_writer(self) -> types.IValueWriter: # Create writer def value_writer(cell: Any): + if self._descriptor and isinstance(self._descriptor, BooleanFieldDescriptor): + return self._descriptor.write_value(cell) return str(cell) return value_writer @@ -244,6 +251,39 @@ def metadata_transform(cls, descriptor: IDescriptor): if format and isinstance(format, str) and format.startswith("fmt:"): descriptor["format"] = format.replace("fmt:", "") + @classmethod + def metadata_import( + cls, + descriptor: IDescriptor, + *, + with_basepath: bool = False, + **options: Any, + ) -> "Field": + descriptor_copy = copy.deepcopy(descriptor) + field = super().metadata_import( + descriptor, + with_basepath=with_basepath, + ) + + if field.type == "boolean": + try: + field._descriptor = BooleanFieldDescriptor.model_validate(descriptor_copy) + except pydantic.ValidationError as ve: + error = errors.SchemaError(note=str(ve)) + raise FrictionlessException(error) + + return field + + def to_descriptor(self, *, validate: bool = False) -> IDescriptor: + if self._descriptor and isinstance(self._descriptor, BooleanFieldDescriptor): + descr = self._descriptor.model_dump(exclude_none=True, exclude_unset=True) + ## Temporarily, Field properties have priority over + ## Field._descriptor properties + descr = {**descr, **super().to_descriptor(validate=validate)} + return descr + else: + return super().to_descriptor(validate=validate) + @classmethod def metadata_validate(cls, descriptor: IDescriptor): # type: ignore metadata_errors = list(super().metadata_validate(descriptor)) @@ -276,9 +316,7 @@ def 
metadata_validate(cls, descriptor: IDescriptor): # type: ignore field.false_values = descriptor["falseValues"] _, notes = field.read_cell(example) if notes is not None: - note = ( - f'example value "{example}" for field "{field.name}" is not valid' - ) + note = f'example value "{example}" for field "{field.name}" is not valid' yield errors.FieldError(note=note) # Misleading diff --git a/frictionless/schema/field_constraints.py b/frictionless/schema/field_constraints.py new file mode 100644 index 0000000000..9323714d0f --- /dev/null +++ b/frictionless/schema/field_constraints.py @@ -0,0 +1,33 @@ +"""field_constraints.py provide pydantic Models for constraints""" + +from typing import Any, Dict, Generic, List, Optional, TypeVar, Union + +import pydantic + +T = TypeVar("T") + + +class BaseConstraints(pydantic.BaseModel, Generic[T]): + required: Optional[bool] = None + unique: Optional[bool] = None + enum: Optional[List[Union[str, T]]] = None + + +class CollectionConstraints(BaseConstraints[str]): + minLength: Optional[int] = None + maxLength: Optional[int] = None + + +class JSONConstraints(CollectionConstraints): + jsonSchema: Optional[Dict[str, Any]] = None + + +class StringConstraints(CollectionConstraints): + pattern: Optional[str] = None + + +class ValueConstraints(BaseConstraints[T], Generic[T]): + minimum: Optional[Union[str, T]] = None + maximum: Optional[Union[str, T]] = None + exclusiveMinimum: Optional[Union[str, T]] = None + exclusiveMaximum: Optional[Union[str, T]] = None diff --git a/frictionless/schema/field_descriptor.py b/frictionless/schema/field_descriptor.py new file mode 100644 index 0000000000..d3e05d0881 --- /dev/null +++ b/frictionless/schema/field_descriptor.py @@ -0,0 +1,363 @@ +"""field_descriptor.py provides pydantic Models for Field descriptors""" + +from __future__ import annotations + +import datetime +from typing import Any, Dict, List, Literal, Optional, Union + +import pydantic +from typing_extensions import Self + +from .. 
import settings +from .field_constraints import ( + BaseConstraints, + CollectionConstraints, + JSONConstraints, + StringConstraints, + ValueConstraints, +) + + +class BaseFieldDescriptor(pydantic.BaseModel): + """Data model of a (unspecialised) field descriptor""" + + name: str + """ + The field descriptor MUST contain a name property. + """ + + title: Optional[str] = None + """ + A human readable label or title for the field + """ + + description: Optional[str] = None + """ + A description for this field e.g. “The recipient of the funds” + """ + + missing_values: Optional[List[str]] = pydantic.Field( + default=None, alias="missingValues" + ) + """ + A list of field values to consider as null values + """ + + example: Optional[str] = None + """ + An example of a value for the field. + """ + + @pydantic.model_validator(mode="before") + @classmethod + def compat(cls, data: Dict[str, Any]) -> Dict[str, Any]: + # Backward compatibility for field.format + + format_ = data.get("format") + if format_: + if format_.startswith("fmt:"): + data["format"] = format_[4:] + + return data + + +class BooleanFieldDescriptor(BaseFieldDescriptor): + """The field contains boolean (true/false) data.""" + + type: Literal["boolean"] = "boolean" + + format: Optional[Literal["default"]] = None + constraints: Optional[BaseConstraints[bool]] = None + + true_values: Optional[List[str]] = pydantic.Field( + default=settings.DEFAULT_TRUE_VALUES, + alias="trueValues", + validation_alias=pydantic.AliasChoices("trueValues", "true_values"), + ) + """ + Values to be interpreted as “true” for boolean fields + """ + + false_values: Optional[List[str]] = pydantic.Field( + default=settings.DEFAULT_FALSE_VALUES, + alias="falseValues", + validation_alias=pydantic.AliasChoices("falseValues", "false_values"), + ) + """ + Values to be interpreted as “false” for boolean fields + """ + + def read_value(self, cell: Any): + if isinstance(cell, bool) and cell is True or cell is False: + return cell + + if 
isinstance(cell, str): + if self.true_values and cell in self.true_values: + return True + if self.false_values and cell in self.false_values: + return False + return None + + def write_value(self, cell: Any): + if self.true_values and self.false_values: + return self.true_values[0] if cell else self.false_values[0] + return None + + @pydantic.model_validator(mode="after") + def validate_example(self) -> Self: + # If example is provided, check it's in true_values or false_values + if self.example is not None: + allowed_values = (self.true_values or []) + (self.false_values or []) + if self.example not in allowed_values: + raise ValueError( + f'example value "{self.example}" for field "{self.name}" is not valid' + ) + + return self + + +class ArrayFieldDescriptor(BaseFieldDescriptor): + """The field contains a valid JSON array.""" + + type: Literal["array"] = "array" + format: Optional[Literal["default"]] = None + constraints: Optional[JSONConstraints] = None + + # TODO type is not accurate : array item are unnamed, not described etc + array_item: Optional[FieldDescriptor] = pydantic.Field( + default=None, alias="arrayItem" + ) + + +class AnyFieldDescriptor(BaseFieldDescriptor): + """The field contains values of a unspecified or mixed type.""" + + type: Literal["any"] = "any" + format: Optional[Literal["default"]] = None + constraints: Optional[BaseConstraints[str]] = None + + +class DateFieldDescriptor(BaseFieldDescriptor): + """he field contains a date without a time.""" + + type: Literal["date"] = "date" + format: Optional[str] = None + constraints: Optional[ValueConstraints[str]] = None + + +class DatetimeFieldDescriptor(BaseFieldDescriptor): + """The field contains a date with a time.""" + + type: Literal["datetime"] = "datetime" + format: Optional[str] = None + constraints: Optional[ValueConstraints[datetime.datetime]] = None + + +class DurationFieldDescriptor(BaseFieldDescriptor): + """The field contains a duration of time.""" + + type: Literal["duration"] = 
"duration" + format: Optional[Literal["default"]] = None + constraints: Optional[ValueConstraints[str]] = None + + +IGeojsonFormat = Literal[ + "default", + "topojson", +] + + +class GeoJSONFieldDescriptor(BaseFieldDescriptor): + """The field contains a JSON object according to GeoJSON or TopoJSON spec.""" + + type: Literal["geojson"] = "geojson" + format: Optional[IGeojsonFormat] = None + constraints: Optional[BaseConstraints[str]] = None + + +class GeoPointFieldDescriptor(BaseFieldDescriptor): + """The field contains data describing a geographic point.""" + + type: Literal["geopoint"] = "geopoint" + format: Optional[IGeojsonFormat] = None + constraints: Optional[BaseConstraints[str]] = None + + +class CategoryDict(pydantic.BaseModel): + value: str + label: Optional[str] = None + + +ICategories = Union[ + List[str], + List[CategoryDict], +] + + +class IntegerFieldDescriptor(BaseFieldDescriptor): + """The field contains integers - that is whole numbers.""" + + type: Literal["integer"] = "integer" + format: Optional[Literal["default"]] = None + constraints: Optional[ValueConstraints[int]] = None + + categories: Optional[ICategories] = None + """ + Property to restrict the field to a finite set of possible values + """ + + categories_ordered: Optional[bool] = pydantic.Field( + default=None, alias="categoriesOrdered" + ) + """ + When categoriesOrdered is true, implementations SHOULD regard the order of + appearance of the values in the categories property as their natural order. 
+ """ + + group_char: Optional[str] = pydantic.Field(default=None, alias="groupChar") + """ + String whose value is used to group digits for integer/number fields + """ + + bare_number: Optional[bool] = pydantic.Field(default=None, alias="bareNumber") + """ + If false leading and trailing non numbers will be removed for integer/number fields + """ + + +IItemType = Literal[ + "boolean", + "date", + "datetime", + "integer", + "number", + "string", + "time", +] + + +class ListFieldDescriptor(BaseFieldDescriptor): + """The field contains data that is an ordered + one-level depth collection of primitive values with a fixed item type. + """ + + type: Literal["list"] = "list" + format: Optional[Literal["default"]] = None + constraints: CollectionConstraints = pydantic.Field( + default_factory=CollectionConstraints + ) + + delimiter: Optional[str] = None + """ + Specifies the character sequence which separates lexically represented list items. + """ + + item_type: Optional[IItemType] = pydantic.Field(default=None, alias="itemType") + """ + Specifies the list item type in terms of existent Table Schema types. 
+ """ + + +class NumberFieldDescriptor(BaseFieldDescriptor): + """The field contains numbers of any kind including decimals.""" + + type: Literal["number"] = "number" + format: Optional[Literal["default"]] = None + constraints: Optional[ValueConstraints[float]] = None + + decimal_char: Optional[str] = pydantic.Field(default=None, alias="decimalChar") + """ + String whose value is used to represent a decimal point for number fields + """ + + group_char: Optional[str] = pydantic.Field(default=None, alias="groupChar") + """ + String whose value is used to group digits for integer/number fields + """ + + bare_number: Optional[bool] = pydantic.Field(default=None, alias="bareNumber") + """ + If false leading and trailing non numbers will be removed for integer/number fields + """ + + +class ObjectFieldDescriptor(BaseFieldDescriptor): + """The field contains a valid JSON object.""" + + type: Literal["object"] = "object" + format: Optional[Literal["default"]] = None + constraints: Optional[JSONConstraints] = None + + +IStringFormat = Literal[ + "binary", + "default", + "email", + "uri", + "uuid", + # Unofficial + "wkt", +] + + +class StringFieldDescriptor(BaseFieldDescriptor): + """The field contains strings, that is, sequences of characters.""" + + type: Literal["string"] = "string" + format: Optional[IStringFormat] = None + constraints: StringConstraints = pydantic.Field(default_factory=StringConstraints) + + categories: Optional[ICategories] = None + """ + Property to restrict the field to a finite set of possible values + """ + + categoriesOrdered: Optional[bool] = None + """ + When categoriesOrdered is true, implementations SHOULD regard the order of + appearance of the values in the categories property as their natural order. 
+ """ + + +class TimeFieldDescriptor(BaseFieldDescriptor): + """The field contains a time without a date.""" + + type: Literal["time"] = "time" + format: Optional[str] = None + constraints: Optional[ValueConstraints[datetime.time]] = None + + +class YearFieldDescriptor(BaseFieldDescriptor): + """The field contains a calendar year.""" + + type: Literal["year"] = "year" + format: Optional[Literal["default"]] = None + constraints: Optional[ValueConstraints[int]] = None + + +class YearmonthFieldDescriptor(BaseFieldDescriptor): + """The field contains a specific month of a specific year.""" + + type: Literal["yearmonth"] = "yearmonth" + format: Optional[Literal["default"]] = None + constraints: Optional[ValueConstraints[str]] = None + + +FieldDescriptor = Union[ + AnyFieldDescriptor, + ArrayFieldDescriptor, + BooleanFieldDescriptor, + DateFieldDescriptor, + DatetimeFieldDescriptor, + DurationFieldDescriptor, + GeoJSONFieldDescriptor, + GeoPointFieldDescriptor, + IntegerFieldDescriptor, + ListFieldDescriptor, + NumberFieldDescriptor, + ObjectFieldDescriptor, + StringFieldDescriptor, + TimeFieldDescriptor, + YearFieldDescriptor, + YearmonthFieldDescriptor, +] From 190afec835f53ad67c4be2bd1142167d23d5b17a Mon Sep 17 00:00:00 2001 From: Pierre Camilleri Date: Mon, 4 Aug 2025 21:42:33 +0200 Subject: [PATCH 2/5] =?UTF-8?q?=F0=9F=94=B5=20do=20not=20change=20private?= =?UTF-8?q?=20property?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- frictionless/detector/detector.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/frictionless/detector/detector.py b/frictionless/detector/detector.py index 6ff0896527..93e3652f0d 100644 --- a/frictionless/detector/detector.py +++ b/frictionless/detector/detector.py @@ -354,15 +354,17 @@ def detect_schema( runner_fields: List[Field] = [] # we use shared fields for candidate in field_candidates: descriptor = candidate.copy() + + if descriptor["type"] == "boolean": + 
if self.field_true_values != settings.DEFAULT_TRUE_VALUES: + descriptor["true_values"] = self.field_true_values # type: ignore + if self.field_false_values != settings.DEFAULT_FALSE_VALUES: + descriptor["false_values"] = self.field_false_values # type: ignore + descriptor["name"] = "shared" field = Field.from_descriptor(descriptor) if field.type == "number" and self.field_float_numbers: field.float_number = True # type: ignore - elif field.type == "boolean": - if self.field_true_values != settings.DEFAULT_TRUE_VALUES: - field._descriptor.true_values = self.field_true_values # type: ignore - if self.field_false_values != settings.DEFAULT_FALSE_VALUES: - field._descriptor.false_values = self.field_false_values # type: ignore runner_fields.append(field) for index, name in enumerate(names): From e88d66f29cf32eecd69de5dfe262f58e6526f0ec Mon Sep 17 00:00:00 2001 From: Pierre Camilleri Date: Wed, 13 Aug 2025 10:53:35 +0200 Subject: [PATCH 3/5] Remove fields/boolean.py --- frictionless/fields/__init__.py | 2 +- frictionless/fields/boolean.py | 39 ++----------------- .../{schema => fields}/field_constraints.py | 0 .../{schema => fields}/field_descriptor.py | 31 ++++++++++++--- frictionless/schema/field.py | 2 +- 5 files changed, 31 insertions(+), 43 deletions(-) rename frictionless/{schema => fields}/field_constraints.py (100%) rename frictionless/{schema => fields}/field_descriptor.py (90%) diff --git a/frictionless/fields/__init__.py b/frictionless/fields/__init__.py index 550d07b079..3dfea7fd50 100644 --- a/frictionless/fields/__init__.py +++ b/frictionless/fields/__init__.py @@ -1,6 +1,6 @@ from .any import AnyField as AnyField from .array import ArrayField as ArrayField -from .boolean import BooleanField as BooleanField +from .boolean import BooleanField from .date import DateField as DateField from .datetime import DatetimeField as DatetimeField from .duration import DurationField as DurationField diff --git a/frictionless/fields/boolean.py 
b/frictionless/fields/boolean.py index 518d6c9c44..04db7e159a 100644 --- a/frictionless/fields/boolean.py +++ b/frictionless/fields/boolean.py @@ -1,39 +1,8 @@ -from __future__ import annotations +from ..schema.field import Field -from typing import List -import attrs - -from .. import settings -from ..schema import Field - - -@attrs.define(kw_only=True, repr=False) class BooleanField(Field): + ### TEMP Only required for Metadata compatibility + ### This is required because "metadata_import" makes a distinction based + ### on the "type" property (`is_typed_class`) type = "boolean" - builtin = True - supported_constraints = [ - "required", - "enum", - ] - - true_values: List[str] = attrs.field(factory=settings.DEFAULT_TRUE_VALUES.copy) - """ - It defines the values to be read as true values while reading data. The default - true values are ["true", "True", "TRUE", "1"]. - """ - - false_values: List[str] = attrs.field(factory=settings.DEFAULT_FALSE_VALUES.copy) - """ - It defines the values to be read as false values while reading data. The default - true values are ["false", "False", "FALSE", "0"]. 
- """ - - # Metadata - - metadata_profile_patch = { - "properties": { - "trueValues": {"type": "array", "items": {"type": "string"}}, - "falseValues": {"type": "array", "items": {"type": "string"}}, - } - } diff --git a/frictionless/schema/field_constraints.py b/frictionless/fields/field_constraints.py similarity index 100% rename from frictionless/schema/field_constraints.py rename to frictionless/fields/field_constraints.py diff --git a/frictionless/schema/field_descriptor.py b/frictionless/fields/field_descriptor.py similarity index 90% rename from frictionless/schema/field_descriptor.py rename to frictionless/fields/field_descriptor.py index d3e05d0881..1512e72825 100644 --- a/frictionless/schema/field_descriptor.py +++ b/frictionless/fields/field_descriptor.py @@ -3,7 +3,7 @@ from __future__ import annotations import datetime -from typing import Any, Dict, List, Literal, Optional, Union +from typing import Any, ClassVar, Dict, List, Literal, Optional, Union import pydantic from typing_extensions import Self @@ -17,6 +17,9 @@ ValueConstraints, ) +TableSchemaTypes = Union[bool, str, float, int] +"""Python equivalents of types supported by the Table schema specification""" + class BaseFieldDescriptor(pydantic.BaseModel): """Data model of a (unspecialised) field descriptor""" @@ -64,7 +67,7 @@ def compat(cls, data: Dict[str, Any]) -> Dict[str, Any]: class BooleanFieldDescriptor(BaseFieldDescriptor): """The field contains boolean (true/false) data.""" - type: Literal["boolean"] = "boolean" + type: ClassVar[Literal["boolean"]] = "boolean" format: Optional[Literal["default"]] = None constraints: Optional[BaseConstraints[bool]] = None @@ -87,8 +90,23 @@ class BooleanFieldDescriptor(BaseFieldDescriptor): Values to be interpreted as “false” for boolean fields """ - def read_value(self, cell: Any): - if isinstance(cell, bool) and cell is True or cell is False: + def read_value(self, cell: TableSchemaTypes) -> Optional[bool]: + """read_value converts the physical 
(possibly typed) representation to + a logical boolean representation. + + See "Data representation" in the glossary for more details. + https://datapackage.org/standard/glossary/#data-representation + + If the physical representation is already typed as a boolean, the + value is returned unchanged. + + If the physical representation is a string, then the string is parsed + as a boolean depending on true_values and false_values options. `None` + is returned if the string cannot be parsed. + + Any other typed input will return `None`. + """ + if isinstance(cell, bool): return cell if isinstance(cell, str): @@ -96,9 +114,10 @@ def read_value(self, cell: Any): return True if self.false_values and cell in self.false_values: return False - return None - def write_value(self, cell: Any): + return None + + def write_value(self, cell: Optional[bool]) -> Optional[str]: if self.true_values and self.false_values: return self.true_values[0] if cell else self.false_values[0] return None diff --git a/frictionless/schema/field.py b/frictionless/schema/field.py index f5499cfb14..b55ef46000 100644 --- a/frictionless/schema/field.py +++ b/frictionless/schema/field.py @@ -11,9 +11,9 @@ from .. import errors, settings from ..exception import FrictionlessException +from ..fields.field_descriptor import BooleanFieldDescriptor, FieldDescriptor from ..metadata import Metadata from ..system import system -from .field_descriptor import BooleanFieldDescriptor, FieldDescriptor if TYPE_CHECKING: from ..types import IDescriptor From 02ce1af5f0ccf67efa545a3c17b534e101cebf77 Mon Sep 17 00:00:00 2001 From: BmnQuentin <59441455+BmnQuentin@users.noreply.github.com> Date: Wed, 10 Dec 2025 16:00:56 +0100 Subject: [PATCH 4/5] refactor: move IntegerField to pydantic (#1759) Integrated pydantic logic to IntegerField. 
All tests pass (apart from 5 console ones not related to modifications) Fixed a few issued: - examples should enable Any type, not only strings - descriptor was broken during the merge of two kinds --- frictionless/fields/array_descriptor.py | 0 frictionless/fields/base_field_descriptor.py | 65 ++++++ frictionless/fields/boolean.py | 3 +- frictionless/fields/boolean_descriptor.py | 50 +++++ frictionless/fields/date.py | 53 ----- frictionless/fields/date_descriptor.py | 53 +++++ frictionless/fields/field_descriptor.py | 197 +++---------------- frictionless/fields/integer.py | 74 +------ frictionless/fields/integer_descriptor.py | 87 ++++++++ frictionless/schema/field.py | 41 +++- 10 files changed, 319 insertions(+), 304 deletions(-) create mode 100644 frictionless/fields/array_descriptor.py create mode 100644 frictionless/fields/base_field_descriptor.py create mode 100644 frictionless/fields/boolean_descriptor.py create mode 100644 frictionless/fields/date_descriptor.py create mode 100644 frictionless/fields/integer_descriptor.py diff --git a/frictionless/fields/array_descriptor.py b/frictionless/fields/array_descriptor.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/frictionless/fields/base_field_descriptor.py b/frictionless/fields/base_field_descriptor.py new file mode 100644 index 0000000000..d89bbac610 --- /dev/null +++ b/frictionless/fields/base_field_descriptor.py @@ -0,0 +1,65 @@ +"""base_field_descriptor.py provides the base Pydantic model for all field descriptors""" + +from __future__ import annotations + +from pydantic import BaseModel, Field as PydanticField, model_validator +from typing import Any, Dict, List, Optional +from typing_extensions import Self + + +class BaseFieldDescriptor(BaseModel): + """Data model of a (unspecialised) field descriptor""" + + name: str + """ + The field descriptor MUST contain a name property. 
+ """ + + title: Optional[str] = None + """ + A human readable label or title for the field + """ + + description: Optional[str] = None + """ + A description for this field e.g. "The recipient of the funds" + """ + + missing_values: Optional[List[str]] = PydanticField( + default=None, alias="missingValues" + ) + """ + A list of field values to consider as null values + """ + + example: Optional[Any] = None + """ + An example of a value for the field. + """ + + @model_validator(mode="before") + @classmethod + def compat(cls, data: Dict[str, Any]) -> Dict[str, Any]: + # Backward compatibility for field.format + + format_ = data.get("format") + if format_: + if format_.startswith("fmt:"): + data["format"] = format_[4:] + + return data + + @model_validator(mode="after") + def validate_example(self) -> Self: + """Validate that the example value can be converted using read_value() if available""" + if self.example is not None: + if hasattr(self, "read_value"): + read_value_method = getattr(self, "read_value") + result = read_value_method(self.example) + if result is None: + raise ValueError( + f'example value "{self.example}" for field "{self.name}" is not valid' + ) + + return self + diff --git a/frictionless/fields/boolean.py b/frictionless/fields/boolean.py index 04db7e159a..365f8a6eef 100644 --- a/frictionless/fields/boolean.py +++ b/frictionless/fields/boolean.py @@ -1,8 +1,9 @@ +from __future__ import annotations from ..schema.field import Field - class BooleanField(Field): ### TEMP Only required for Metadata compatibility ### This is required because "metadata_import" makes a distinction based ### on the "type" property (`is_typed_class`) type = "boolean" + \ No newline at end of file diff --git a/frictionless/fields/boolean_descriptor.py b/frictionless/fields/boolean_descriptor.py new file mode 100644 index 0000000000..031540bebe --- /dev/null +++ b/frictionless/fields/boolean_descriptor.py @@ -0,0 +1,50 @@ +from typing import Any, ClassVar, List, Literal, 
Optional + +from pydantic import Field as PydanticField, AliasChoices + +from .. import settings +from .base_field_descriptor import BaseFieldDescriptor +from .field_constraints import BaseConstraints + +class BooleanFieldDescriptor(BaseFieldDescriptor): + """The field contains boolean (true/false) data.""" + + type: ClassVar[Literal["boolean"]] = "boolean" + + format: Optional[Literal["default"]] = None + constraints: Optional[BaseConstraints[bool]] = None + + true_values: Optional[List[str]] = PydanticField( + default=settings.DEFAULT_TRUE_VALUES, + alias="trueValues", + validation_alias=AliasChoices("trueValues", "true_values"), + ) + """ + Values to be interpreted as "true" for boolean fields + """ + + false_values: Optional[List[str]] = PydanticField( + default=settings.DEFAULT_FALSE_VALUES, + alias="falseValues", + validation_alias=AliasChoices("falseValues", "false_values"), + ) + """ + Values to be interpreted as "false" for boolean fields + """ + + def read_value(self, cell: Any) -> Optional[bool]: + if isinstance(cell, bool): + return cell + + if isinstance(cell, str): + if self.true_values and cell in self.true_values: + return True + if self.false_values and cell in self.false_values: + return False + + return None + + def write_value(self, cell: Optional[bool]) -> Optional[str]: + if self.true_values and self.false_values: + return self.true_values[0] if cell else self.false_values[0] + return None diff --git a/frictionless/fields/date.py b/frictionless/fields/date.py index 809f037ec1..b13521ca1b 100644 --- a/frictionless/fields/date.py +++ b/frictionless/fields/date.py @@ -1,15 +1,8 @@ from __future__ import annotations -from datetime import date, datetime -from typing import Any - import attrs - -from .. 
import settings -from ..platform import platform from ..schema import Field - @attrs.define(kw_only=True, repr=False) class DateField(Field): type = "date" @@ -21,49 +14,3 @@ class DateField(Field): "enum", ] - # Read - - # TODO: use different value_readers based on format (see string) - def create_value_reader(self): - # Create reader - def value_reader(cell: Any): - if isinstance(cell, datetime): - value_time = cell.time() - if ( - value_time.hour == 0 - and value_time.minute == 0 - and value_time.second == 0 - ): - return datetime(cell.year, cell.month, cell.day).date() - else: - return None - if isinstance(cell, date): - return cell - if not isinstance(cell, str): - return None - try: - if self.format == "default": - cell = datetime.strptime(cell, settings.DEFAULT_DATE_PATTERN).date() - elif self.format == "any": - cell = platform.dateutil_parser.parse(cell).date() - else: - cell = datetime.strptime(cell, self.format).date() - except Exception: - return None - return cell - - return value_reader - - # Write - - def create_value_writer(self): - # Create format - format = self.format - if format == settings.DEFAULT_FIELD_FORMAT: - format = settings.DEFAULT_DATE_PATTERN - - # Create writer - def value_writer(cell: Any): - return cell.strftime(format) - - return value_writer diff --git a/frictionless/fields/date_descriptor.py b/frictionless/fields/date_descriptor.py new file mode 100644 index 0000000000..b332c55508 --- /dev/null +++ b/frictionless/fields/date_descriptor.py @@ -0,0 +1,53 @@ +import datetime +from typing import Any, Literal, Optional + + +from .. 
import settings +from .base_field_descriptor import BaseFieldDescriptor +from .field_constraints import ValueConstraints + + +class DateFieldDescriptor(BaseFieldDescriptor): + """The field contains a date without a time.""" + + type: Literal["date"] = "date" + format: Optional[str] = None + constraints: Optional[ValueConstraints[str]] = None + + def read_value(self, cell: Any) -> Optional[datetime.date]: + from datetime import date, datetime + from ..platform import platform + + if isinstance(cell, datetime): + value_time = cell.time() + if ( + value_time.hour == 0 + and value_time.minute == 0 + and value_time.second == 0 + ): + return datetime(cell.year, cell.month, cell.day).date() + else: + return None + if isinstance(cell, date): + return cell + if not isinstance(cell, str): + return None + try: + format_value = self.format or "default" + if format_value == "default": + cell = datetime.strptime(cell, settings.DEFAULT_DATE_PATTERN).date() + elif format_value == "any": + cell = platform.dateutil_parser.parse(cell).date() + else: + cell = datetime.strptime(cell, format_value).date() + except Exception: + return None + return cell + + def write_value(self, cell: Optional[datetime.date]) -> Optional[str]: + if cell is None: + return None + format_value = self.format or "default" + if format_value == settings.DEFAULT_FIELD_FORMAT: + format_value = settings.DEFAULT_DATE_PATTERN + return cell.strftime(format_value) diff --git a/frictionless/fields/field_descriptor.py b/frictionless/fields/field_descriptor.py index 1512e72825..32b9d95e9c 100644 --- a/frictionless/fields/field_descriptor.py +++ b/frictionless/fields/field_descriptor.py @@ -1,14 +1,11 @@ -"""field_descriptor.py provides pydantic Models for Field descriptors""" - from __future__ import annotations import datetime -from typing import Any, ClassVar, Dict, List, Literal, Optional, Union +from typing import List, Literal, Optional, Union -import pydantic -from typing_extensions import Self +from pydantic 
import Field as PydanticField, BaseModel -from .. import settings +from .base_field_descriptor import BaseFieldDescriptor from .field_constraints import ( BaseConstraints, CollectionConstraints, @@ -17,122 +14,9 @@ ValueConstraints, ) -TableSchemaTypes = Union[bool, str, float, int] -"""Python equivalents of types supported by the Table schema specification""" - - -class BaseFieldDescriptor(pydantic.BaseModel): - """Data model of a (unspecialised) field descriptor""" - - name: str - """ - The field descriptor MUST contain a name property. - """ - - title: Optional[str] = None - """ - A human readable label or title for the field - """ - - description: Optional[str] = None - """ - A description for this field e.g. “The recipient of the funds” - """ - - missing_values: Optional[List[str]] = pydantic.Field( - default=None, alias="missingValues" - ) - """ - A list of field values to consider as null values - """ - - example: Optional[str] = None - """ - An example of a value for the field. - """ - - @pydantic.model_validator(mode="before") - @classmethod - def compat(cls, data: Dict[str, Any]) -> Dict[str, Any]: - # Backward compatibility for field.format - - format_ = data.get("format") - if format_: - if format_.startswith("fmt:"): - data["format"] = format_[4:] - - return data - - -class BooleanFieldDescriptor(BaseFieldDescriptor): - """The field contains boolean (true/false) data.""" - - type: ClassVar[Literal["boolean"]] = "boolean" - - format: Optional[Literal["default"]] = None - constraints: Optional[BaseConstraints[bool]] = None - - true_values: Optional[List[str]] = pydantic.Field( - default=settings.DEFAULT_TRUE_VALUES, - alias="trueValues", - validation_alias=pydantic.AliasChoices("trueValues", "true_values"), - ) - """ - Values to be interpreted as “true” for boolean fields - """ - - false_values: Optional[List[str]] = pydantic.Field( - default=settings.DEFAULT_FALSE_VALUES, - alias="falseValues", - validation_alias=pydantic.AliasChoices("falseValues", 
"false_values"), - ) - """ - Values to be interpreted as “false” for boolean fields - """ - - def read_value(self, cell: TableSchemaTypes) -> Optional[bool]: - """read_value converts the physical (possibly typed) representation to - a logical boolean representation. - - See "Data representation" in the glossary for more details. - https://datapackage.org/standard/glossary/#data-representation - - If the physical representation is already typed as a boolean, the - value is returned unchanged. - - If the physical representation is a string, then the string is parsed - as a boolean depending on true_values and false_values options. `None` - is returned if the string cannot be parsed. - - Any other typed input will return `None`. - """ - if isinstance(cell, bool): - return cell - - if isinstance(cell, str): - if self.true_values and cell in self.true_values: - return True - if self.false_values and cell in self.false_values: - return False - - return None - - def write_value(self, cell: Optional[bool]) -> Optional[str]: - if self.true_values and self.false_values: - return self.true_values[0] if cell else self.false_values[0] - return None - - @pydantic.model_validator(mode="after") - def validate_example(self) -> Self: - # If example is provided, check it's in true_values or false_values - if self.example is not None: - allowed_values = (self.true_values or []) + (self.false_values or []) - if self.example not in allowed_values: - raise ValueError( - f'example value "{self.example}" for field "{self.name}" is not valid' - ) - - return self +from .boolean_descriptor import BooleanFieldDescriptor +from .date_descriptor import DateFieldDescriptor +from .integer_descriptor import IntegerFieldDescriptor class ArrayFieldDescriptor(BaseFieldDescriptor): @@ -142,8 +26,9 @@ class ArrayFieldDescriptor(BaseFieldDescriptor): format: Optional[Literal["default"]] = None constraints: Optional[JSONConstraints] = None - # TODO type is not accurate : array item are unnamed, not 
described etc - array_item: Optional[FieldDescriptor] = pydantic.Field( + # TODO type is not accurate : array item are unnamed, not described etc + # Using string annotation to avoid circular import + array_item: Optional["FieldDescriptor"] = PydanticField( default=None, alias="arrayItem" ) @@ -156,14 +41,6 @@ class AnyFieldDescriptor(BaseFieldDescriptor): constraints: Optional[BaseConstraints[str]] = None -class DateFieldDescriptor(BaseFieldDescriptor): - """he field contains a date without a time.""" - - type: Literal["date"] = "date" - format: Optional[str] = None - constraints: Optional[ValueConstraints[str]] = None - - class DatetimeFieldDescriptor(BaseFieldDescriptor): """The field contains a date with a time.""" @@ -202,7 +79,8 @@ class GeoPointFieldDescriptor(BaseFieldDescriptor): constraints: Optional[BaseConstraints[str]] = None -class CategoryDict(pydantic.BaseModel): +class CategoryDict(BaseModel): + """Category dictionary for field categories.""" value: str label: Optional[str] = None @@ -211,37 +89,7 @@ class CategoryDict(pydantic.BaseModel): List[str], List[CategoryDict], ] - - -class IntegerFieldDescriptor(BaseFieldDescriptor): - """The field contains integers - that is whole numbers.""" - - type: Literal["integer"] = "integer" - format: Optional[Literal["default"]] = None - constraints: Optional[ValueConstraints[int]] = None - - categories: Optional[ICategories] = None - """ - Property to restrict the field to a finite set of possible values - """ - - categories_ordered: Optional[bool] = pydantic.Field( - default=None, alias="categoriesOrdered" - ) - """ - When categoriesOrdered is true, implementations SHOULD regard the order of - appearance of the values in the categories property as their natural order. 
- """ - - group_char: Optional[str] = pydantic.Field(default=None, alias="groupChar") - """ - String whose value is used to group digits for integer/number fields - """ - - bare_number: Optional[bool] = pydantic.Field(default=None, alias="bareNumber") - """ - If false leading and trailing non numbers will be removed for integer/number fields - """ +"""Categories type used by IntegerFieldDescriptor and StringFieldDescriptor""" IItemType = Literal[ @@ -262,7 +110,7 @@ class ListFieldDescriptor(BaseFieldDescriptor): type: Literal["list"] = "list" format: Optional[Literal["default"]] = None - constraints: CollectionConstraints = pydantic.Field( + constraints: CollectionConstraints = PydanticField( default_factory=CollectionConstraints ) @@ -271,7 +119,7 @@ class ListFieldDescriptor(BaseFieldDescriptor): Specifies the character sequence which separates lexically represented list items. """ - item_type: Optional[IItemType] = pydantic.Field(default=None, alias="itemType") + item_type: Optional[IItemType] = PydanticField(default=None, alias="itemType") """ Specifies the list item type in terms of existent Table Schema types. 
""" @@ -284,17 +132,17 @@ class NumberFieldDescriptor(BaseFieldDescriptor): format: Optional[Literal["default"]] = None constraints: Optional[ValueConstraints[float]] = None - decimal_char: Optional[str] = pydantic.Field(default=None, alias="decimalChar") + decimal_char: Optional[str] = PydanticField(default=None, alias="decimalChar") """ String whose value is used to represent a decimal point for number fields """ - group_char: Optional[str] = pydantic.Field(default=None, alias="groupChar") + group_char: Optional[str] = PydanticField(default=None, alias="groupChar") """ String whose value is used to group digits for integer/number fields """ - bare_number: Optional[bool] = pydantic.Field(default=None, alias="bareNumber") + bare_number: Optional[bool] = PydanticField(default=None, alias="bareNumber") """ If false leading and trailing non numbers will be removed for integer/number fields """ @@ -324,7 +172,7 @@ class StringFieldDescriptor(BaseFieldDescriptor): type: Literal["string"] = "string" format: Optional[IStringFormat] = None - constraints: StringConstraints = pydantic.Field(default_factory=StringConstraints) + constraints: StringConstraints = PydanticField(default_factory=StringConstraints) categories: Optional[ICategories] = None """ @@ -361,17 +209,16 @@ class YearmonthFieldDescriptor(BaseFieldDescriptor): format: Optional[Literal["default"]] = None constraints: Optional[ValueConstraints[str]] = None - FieldDescriptor = Union[ AnyFieldDescriptor, - ArrayFieldDescriptor, - BooleanFieldDescriptor, - DateFieldDescriptor, + ArrayFieldDescriptor, # wip + BooleanFieldDescriptor, # v + DateFieldDescriptor, # v DatetimeFieldDescriptor, DurationFieldDescriptor, GeoJSONFieldDescriptor, GeoPointFieldDescriptor, - IntegerFieldDescriptor, + IntegerFieldDescriptor, # v ListFieldDescriptor, NumberFieldDescriptor, ObjectFieldDescriptor, diff --git a/frictionless/fields/integer.py b/frictionless/fields/integer.py index 28586607fc..40951afbaa 100644 --- 
a/frictionless/fields/integer.py +++ b/frictionless/fields/integer.py @@ -1,17 +1,10 @@ from __future__ import annotations +from ..schema.field import Field -import re -from decimal import Decimal -from typing import Any - -import attrs - -from .. import settings -from ..schema import Field - - -@attrs.define(kw_only=True, repr=False) class IntegerField(Field): + ### TEMP Only required for Metadata compatibility + ### This is required because "metadata_import" makes a distinction based + ### on the "type" property (`is_typed_class`) type = "integer" builtin = True supported_constraints = [ @@ -20,62 +13,3 @@ class IntegerField(Field): "maximum", "enum", ] - - bare_number: bool = settings.DEFAULT_BARE_NUMBER - """ - It specifies that the value is a bare number. If true, the pattern to - remove non digit character does not get applied and vice versa. - The default value is True. - """ - - # Read - - def create_value_reader(self): - # Create pattern - pattern = None - if not self.bare_number: - pattern = re.compile(r"((^[^-\d]*)|(\D*$))") - - # Create reader - def value_reader(cell: Any): - if isinstance(cell, str): - cell = cell.strip() - - # Process the cell - if pattern: - cell = pattern.sub("", cell) - - # Cast the cell - try: - return int(cell) - except Exception: - return None - - elif cell is True or cell is False: - return None - elif isinstance(cell, int): - return cell - elif isinstance(cell, float) and cell.is_integer(): - return int(cell) - elif isinstance(cell, Decimal) and cell % 1 == 0: - return int(cell) - return None - - return value_reader - - # Write - - def create_value_writer(self): - # Create writer - def value_writer(cell: Any): - return str(cell) - - return value_writer - - # Metadata - - metadata_profile_patch = { - "properties": { - "bareNumber": {"type": "boolean"}, - } - } diff --git a/frictionless/fields/integer_descriptor.py b/frictionless/fields/integer_descriptor.py new file mode 100644 index 0000000000..675e336bcb --- /dev/null +++ 
b/frictionless/fields/integer_descriptor.py @@ -0,0 +1,87 @@ + +import re +from decimal import Decimal +from typing import Any, ClassVar, Literal, Optional, Pattern, Union, List + +from pydantic import Field as PydanticField, BaseModel + +from .. import settings +from .base_field_descriptor import BaseFieldDescriptor +from .field_constraints import ValueConstraints + + +class CategoryDict(BaseModel): + """Category dictionary for field categories.""" + value: str + label: Optional[str] = None + +ICategories = Union[ + List[str], + List[CategoryDict], +] + +class IntegerFieldDescriptor(BaseFieldDescriptor): + """The field contains integers - that is whole numbers.""" + + type: Literal["integer"] = "integer" + format: Optional[Literal["default"]] = None + constraints: Optional[ValueConstraints[int]] = None + + categories: Optional[ICategories] = None + """ + Property to restrict the field to a finite set of possible values + """ + + categories_ordered: Optional[bool] = PydanticField( + default=None, alias="categoriesOrdered" + ) + """ + When categoriesOrdered is true, implementations SHOULD regard the order of + appearance of the values in the categories property as their natural order. 
+ """ + + group_char: Optional[str] = PydanticField(default=None, alias="groupChar") + """ + String whose value is used to group digits for integer/number fields + """ + + bare_number: bool = PydanticField( + default=settings.DEFAULT_BARE_NUMBER, alias="bareNumber" + ) + """ + If false leading and trailing non numbers will be removed for integer/number fields + """ + + pattern: ClassVar[Pattern[str]] = re.compile(r"((^[^-\d]*)|(\D*$))") + + def read_value(self, cell: Any) -> Optional[int]: + if isinstance(cell, bool): + return None + + elif isinstance(cell, int): + return cell + + elif isinstance(cell, str): + cell = cell.strip() + + # Process the cell (remove non-digit characters if bare_number is False) + if not self.bare_number: + cell = self.pattern.sub("", cell) + + # Cast the cell + try: + return int(cell) + except Exception: + return None + + elif isinstance(cell, float) and cell.is_integer(): + return int(cell) + elif isinstance(cell, Decimal) and cell % 1 == 0: + return int(cell) + + return None + + def write_value(self, cell: Optional[int]) -> Optional[str]: + if cell is None: + return None + return str(cell) diff --git a/frictionless/schema/field.py b/frictionless/schema/field.py index b55ef46000..4114bedef5 100644 --- a/frictionless/schema/field.py +++ b/frictionless/schema/field.py @@ -11,7 +11,10 @@ from .. 
import errors, settings from ..exception import FrictionlessException -from ..fields.field_descriptor import BooleanFieldDescriptor, FieldDescriptor +# from ..fields.boolean_descriptor import BooleanFieldDescriptor +# from ..fields.date_descriptor import DateFieldDescriptor +# from ..fields.integer_descriptor import IntegerFieldDescriptor +from ..fields.field_descriptor import BooleanFieldDescriptor, DateFieldDescriptor, IntegerFieldDescriptor, FieldDescriptor from ..metadata import Metadata from ..system import system @@ -25,7 +28,7 @@ class Field(Metadata): """Field representation""" - _descriptor: Optional[FieldDescriptor] = None + _descriptor: Optional[ FieldDescriptor] = None name: str """ @@ -159,6 +162,10 @@ def create_value_reader(self) -> types.IValueReader: def value_reader(cell: Any): if self._descriptor and isinstance(self._descriptor, BooleanFieldDescriptor): return self._descriptor.read_value(cell) + if self._descriptor and isinstance(self._descriptor, IntegerFieldDescriptor): + return self._descriptor.read_value(cell) + if self._descriptor and isinstance(self._descriptor, DateFieldDescriptor): + return self._descriptor.read_value(cell) return cell return value_reader @@ -199,6 +206,10 @@ def create_value_writer(self) -> types.IValueWriter: def value_writer(cell: Any): if self._descriptor and isinstance(self._descriptor, BooleanFieldDescriptor): return self._descriptor.write_value(cell) + if self._descriptor and isinstance(self._descriptor, IntegerFieldDescriptor): + return self._descriptor.write_value(cell) + if self._descriptor and isinstance(self._descriptor, DateFieldDescriptor): + return self._descriptor.write_value(cell) return str(cell) return value_writer @@ -271,15 +282,35 @@ def metadata_import( except pydantic.ValidationError as ve: error = errors.SchemaError(note=str(ve)) raise FrictionlessException(error) + elif field.type == "integer": + try: + field._descriptor = IntegerFieldDescriptor.model_validate(descriptor_copy) + except 
pydantic.ValidationError as ve: + error = errors.SchemaError(note=str(ve)) + raise FrictionlessException(error) + elif field.type == "date": + try: + field._descriptor = DateFieldDescriptor.model_validate(descriptor_copy) + except pydantic.ValidationError as ve: + error = errors.SchemaError(note=str(ve)) + raise FrictionlessException(error) return field def to_descriptor(self, *, validate: bool = False) -> IDescriptor: - if self._descriptor and isinstance(self._descriptor, BooleanFieldDescriptor): - descr = self._descriptor.model_dump(exclude_none=True, exclude_unset=True) + if self._descriptor and isinstance( + self._descriptor, (BooleanFieldDescriptor, IntegerFieldDescriptor, DateFieldDescriptor) + ): + base_descr = super().to_descriptor(validate=validate) + # Set by_alias=True to get camelCase keys used by Frictionless (bareNumber) instead of snake_case (bare_number) + # Exclude 'name' from descriptor_descr because it may be "shared" (coming from detector.py) + descriptor_descr = self._descriptor.model_dump( + exclude_none=True, exclude_unset=True, by_alias=True, exclude={"name"} + ) ## Temporarily, Field properties have priority over ## Field._descriptor properties - descr = {**descr, **super().to_descriptor(validate=validate)} + ## Merge descriptor_descr into base_descr to preserve base order + descr = {**base_descr, **descriptor_descr} return descr else: return super().to_descriptor(validate=validate) From f2b8eccc039530cdeaec9a723d45be33102527ce Mon Sep 17 00:00:00 2001 From: BmnQuentin <59441455+BmnQuentin@users.noreply.github.com> Date: Wed, 8 Apr 2026 13:22:51 +0200 Subject: [PATCH 5/5] Refactor other fields to #1751 (#1760) --- frictionless/fields/any_descriptor.py | 21 ++ frictionless/fields/array.py | 29 -- frictionless/fields/array_descriptor.py | 40 +++ frictionless/fields/datetime.py | 46 ---- frictionless/fields/datetime_descriptor.py | 44 +++ frictionless/fields/duration.py | 29 -- frictionless/fields/duration_descriptor.py | 31 +++ 
frictionless/fields/field_descriptor.py | 201 ++------------ frictionless/fields/geojson.py | 55 ---- frictionless/fields/geojson_descriptor.py | 44 +++ frictionless/fields/geopoint.py | 66 ----- frictionless/fields/geopoint_descriptor.py | 66 +++++ frictionless/fields/number.py | 118 +-------- frictionless/fields/number_descriptor.py | 109 ++++++++ frictionless/fields/object.py | 30 --- frictionless/fields/object_descriptor.py | 31 +++ frictionless/fields/string.py | 98 ------- frictionless/fields/string_descriptor.py | 102 +++++++ frictionless/fields/time.py | 48 ---- frictionless/fields/time_descriptor.py | 48 ++++ frictionless/fields/year.py | 31 --- frictionless/fields/year_descriptor.py | 32 +++ frictionless/fields/yearmonth.py | 44 --- frictionless/fields/yearmonth_descriptor.py | 43 +++ frictionless/formats/jsonschema/mapper.py | 10 +- frictionless/schema/field.py | 279 ++++++++++++++++---- 26 files changed, 880 insertions(+), 815 deletions(-) create mode 100644 frictionless/fields/any_descriptor.py create mode 100644 frictionless/fields/datetime_descriptor.py create mode 100644 frictionless/fields/duration_descriptor.py create mode 100644 frictionless/fields/geojson_descriptor.py create mode 100644 frictionless/fields/geopoint_descriptor.py create mode 100644 frictionless/fields/number_descriptor.py create mode 100644 frictionless/fields/object_descriptor.py create mode 100644 frictionless/fields/string_descriptor.py create mode 100644 frictionless/fields/time_descriptor.py create mode 100644 frictionless/fields/year_descriptor.py create mode 100644 frictionless/fields/yearmonth_descriptor.py diff --git a/frictionless/fields/any_descriptor.py b/frictionless/fields/any_descriptor.py new file mode 100644 index 0000000000..87ffcf6939 --- /dev/null +++ b/frictionless/fields/any_descriptor.py @@ -0,0 +1,21 @@ +from typing import Any, Literal, Optional + +from .base_field_descriptor import BaseFieldDescriptor +from .field_constraints import BaseConstraints + + 
+class AnyFieldDescriptor(BaseFieldDescriptor): + """The field contains values of a unspecified or mixed type.""" + + type: Literal["any"] = "any" + format: Optional[Literal["default"]] = None + constraints: Optional[BaseConstraints[str]] = None + + def read_value(self, cell: Any) -> Any: + # Any field accepts any value as-is + return cell + + def write_value(self, cell: Any) -> Any: + # Any field returns the value as-is + return cell + diff --git a/frictionless/fields/array.py b/frictionless/fields/array.py index fb47aa655c..bf6d1c2bf8 100644 --- a/frictionless/fields/array.py +++ b/frictionless/fields/array.py @@ -1,6 +1,5 @@ from __future__ import annotations -import json from typing import Any, Dict, Optional import attrs @@ -55,34 +54,6 @@ def cell_reader(cell: Any): return cell_reader - def create_value_reader(self): - # Create reader - def value_reader(cell: Any): # type: ignore - if not isinstance(cell, list): - if isinstance(cell, str): - try: - cell = json.loads(cell) - except Exception: - return None - if not isinstance(cell, list): - return None - elif isinstance(cell, tuple): - cell = list(cell) # type: ignore - else: - return None - return cell # type: ignore - - return value_reader - - # Write - - def create_value_writer(self): - # Create writer - def value_writer(cell: Any): - return json.dumps(cell) - - return value_writer - # Metadata metadata_profile_patch = { diff --git a/frictionless/fields/array_descriptor.py b/frictionless/fields/array_descriptor.py index e69de29bb2..a890c822d5 100644 --- a/frictionless/fields/array_descriptor.py +++ b/frictionless/fields/array_descriptor.py @@ -0,0 +1,40 @@ +from __future__ import annotations + +import json +from typing import Any, Literal, Optional + +from pydantic import Field as PydanticField + +from .base_field_descriptor import BaseFieldDescriptor +from .field_constraints import JSONConstraints + + +class ArrayFieldDescriptor(BaseFieldDescriptor): + """The field contains a valid JSON array.""" + + type: 
Literal["array"] = "array" + format: Optional[Literal["default"]] = None + constraints: Optional[JSONConstraints] = None + # TODO: check later: + # arrayItem in Frictionless schemas is an unnamed field-like descriptor to prevent using a full FieldDescriptor with "name" (backward compatibility) + array_item: Optional[dict[str, Any]] = PydanticField(default=None, alias="arrayItem") + + def read_value(self, cell: Any) -> Optional[list[Any]]: + if not isinstance(cell, list): + if isinstance(cell, str): + try: + cell = json.loads(cell) + except Exception: + return None + if not isinstance(cell, list): + return None + elif isinstance(cell, tuple): + cell = list(cell) # type: ignore[arg-type] + else: + return None + return cell # type: ignore[return-value] + + def write_value(self, cell: Any) -> str: + return json.dumps(cell) + + diff --git a/frictionless/fields/datetime.py b/frictionless/fields/datetime.py index 6ac16d20b5..0795c8dfea 100644 --- a/frictionless/fields/datetime.py +++ b/frictionless/fields/datetime.py @@ -1,12 +1,7 @@ from __future__ import annotations -from datetime import datetime -from typing import Any - import attrs -from .. 
import settings -from ..platform import platform from ..schema import Field @@ -20,44 +15,3 @@ class DatetimeField(Field): "maximum", "enum", ] - - # Read - - # TODO: use different value_readers based on format (see string) - def create_value_reader(self): - # Create reader - def value_reader(cell: Any): - if not isinstance(cell, datetime): - if not isinstance(cell, str): - return None - try: - if self.format == "default": - # Guard against shorter formats supported by dateutil - assert cell[16] == ":" - assert len(cell) >= 19 - cell = platform.dateutil_parser.isoparse(cell) - elif self.format == "any": - cell = platform.dateutil_parser.parse(cell) - else: - cell = datetime.strptime(cell, self.format) - except Exception: - return None - return cell - - return value_reader - - # Write - - def create_value_writer(self): - # Create format - format = self.format - if format == settings.DEFAULT_FIELD_FORMAT: - format = settings.DEFAULT_DATETIME_PATTERN - - # Create writer - def value_writer(cell: Any): - cell = cell.strftime(format) - cell = cell.replace("+0000", "Z") - return cell - - return value_writer diff --git a/frictionless/fields/datetime_descriptor.py b/frictionless/fields/datetime_descriptor.py new file mode 100644 index 0000000000..c2fca77d99 --- /dev/null +++ b/frictionless/fields/datetime_descriptor.py @@ -0,0 +1,44 @@ +import datetime +from typing import Any, Literal, Optional + +from .. 
import settings +from ..platform import platform +from .base_field_descriptor import BaseFieldDescriptor +from .field_constraints import ValueConstraints + + +class DatetimeFieldDescriptor(BaseFieldDescriptor): + """The field contains a date with a time.""" + + type: Literal["datetime"] = "datetime" + format: Optional[str] = None + constraints: Optional[ValueConstraints[datetime.datetime]] = None + + def read_value(self, cell: Any) -> Optional[datetime.datetime]: + if not isinstance(cell, datetime.datetime): + if not isinstance(cell, str): + return None + try: + format_value = self.format or "default" + if format_value == "default": + # Guard against shorter formats supported by dateutil + assert cell[16] == ":" + assert len(cell) >= 19 + cell = platform.dateutil_parser.isoparse(cell) + elif format_value == "any": + cell = platform.dateutil_parser.parse(cell) + else: + cell = datetime.datetime.strptime(cell, format_value) + except Exception: + return None + return cell + + def write_value(self, cell: Optional[datetime.datetime]) -> Optional[str]: + if cell is None: + return None + format_value = self.format or "default" + if format_value == settings.DEFAULT_FIELD_FORMAT: + format_value = settings.DEFAULT_DATETIME_PATTERN + result = cell.strftime(format_value) + result = result.replace("+0000", "Z") + return result diff --git a/frictionless/fields/duration.py b/frictionless/fields/duration.py index f0ddc61451..d641442c4b 100644 --- a/frictionless/fields/duration.py +++ b/frictionless/fields/duration.py @@ -1,11 +1,7 @@ from __future__ import annotations -import datetime -from typing import Any - import attrs -from ..platform import platform from ..schema import Field @@ -17,28 +13,3 @@ class DurationField(Field): "required", "enum", ] - - # Read - - def create_value_reader(self): - # Create reader - def value_reader(cell: Any): - if not isinstance(cell, (platform.isodate.Duration, datetime.timedelta)): # type: ignore - if not isinstance(cell, str): - return None - 
try: - cell = platform.isodate.parse_duration(cell) # type: ignore - except Exception: - return None - return cell - - return value_reader - - # Write - - def create_value_writer(self): - # Create writer - def value_writer(cell: Any): # type: ignore - return platform.isodate.duration_isoformat(cell) # type: ignore - - return value_writer diff --git a/frictionless/fields/duration_descriptor.py b/frictionless/fields/duration_descriptor.py new file mode 100644 index 0000000000..0215af3c6e --- /dev/null +++ b/frictionless/fields/duration_descriptor.py @@ -0,0 +1,31 @@ +import datetime +from typing import Any, Literal, Optional + +from ..platform import platform +from .base_field_descriptor import BaseFieldDescriptor +from .field_constraints import ValueConstraints + + +class DurationFieldDescriptor(BaseFieldDescriptor): + """The field contains a duration of time.""" + + type: Literal["duration"] = "duration" + format: Optional[Literal["default"]] = None + constraints: Optional[ValueConstraints[str]] = None + + def read_value(self, cell: Any) -> Any: + if not isinstance(cell, (platform.isodate.Duration, datetime.timedelta)): # type: ignore + if not isinstance(cell, str): + return None + try: + cell = platform.isodate.parse_duration(cell) # type: ignore + except Exception: + return None + return cell + + def write_value(self, cell: Any) -> Optional[str]: + if cell is None: + return None + return platform.isodate.duration_isoformat(cell) # type: ignore + + diff --git a/frictionless/fields/field_descriptor.py b/frictionless/fields/field_descriptor.py index 32b9d95e9c..4841ad7ff5 100644 --- a/frictionless/fields/field_descriptor.py +++ b/frictionless/fields/field_descriptor.py @@ -1,95 +1,27 @@ from __future__ import annotations -import datetime -from typing import List, Literal, Optional, Union +from typing import Literal, Optional, Union -from pydantic import Field as PydanticField, BaseModel +from pydantic import Field as PydanticField from .base_field_descriptor import 
BaseFieldDescriptor -from .field_constraints import ( - BaseConstraints, - CollectionConstraints, - JSONConstraints, - StringConstraints, - ValueConstraints, -) +from .field_constraints import CollectionConstraints +from .any_descriptor import AnyFieldDescriptor +from .array_descriptor import ArrayFieldDescriptor from .boolean_descriptor import BooleanFieldDescriptor from .date_descriptor import DateFieldDescriptor +from .datetime_descriptor import DatetimeFieldDescriptor +from .duration_descriptor import DurationFieldDescriptor +from .geojson_descriptor import GeoJSONFieldDescriptor +from .geopoint_descriptor import GeoPointFieldDescriptor from .integer_descriptor import IntegerFieldDescriptor - - -class ArrayFieldDescriptor(BaseFieldDescriptor): - """The field contains a valid JSON array.""" - - type: Literal["array"] = "array" - format: Optional[Literal["default"]] = None - constraints: Optional[JSONConstraints] = None - - # TODO type is not accurate : array item are unnamed, not described etc - # Using string annotation to avoid circular import - array_item: Optional["FieldDescriptor"] = PydanticField( - default=None, alias="arrayItem" - ) - - -class AnyFieldDescriptor(BaseFieldDescriptor): - """The field contains values of a unspecified or mixed type.""" - - type: Literal["any"] = "any" - format: Optional[Literal["default"]] = None - constraints: Optional[BaseConstraints[str]] = None - - -class DatetimeFieldDescriptor(BaseFieldDescriptor): - """The field contains a date with a time.""" - - type: Literal["datetime"] = "datetime" - format: Optional[str] = None - constraints: Optional[ValueConstraints[datetime.datetime]] = None - - -class DurationFieldDescriptor(BaseFieldDescriptor): - """The field contains a duration of time.""" - - type: Literal["duration"] = "duration" - format: Optional[Literal["default"]] = None - constraints: Optional[ValueConstraints[str]] = None - - -IGeojsonFormat = Literal[ - "default", - "topojson", -] - - -class 
GeoJSONFieldDescriptor(BaseFieldDescriptor): - """The field contains a JSON object according to GeoJSON or TopoJSON spec.""" - - type: Literal["geojson"] = "geojson" - format: Optional[IGeojsonFormat] = None - constraints: Optional[BaseConstraints[str]] = None - - -class GeoPointFieldDescriptor(BaseFieldDescriptor): - """The field contains data describing a geographic point.""" - - type: Literal["geopoint"] = "geopoint" - format: Optional[IGeojsonFormat] = None - constraints: Optional[BaseConstraints[str]] = None - - -class CategoryDict(BaseModel): - """Category dictionary for field categories.""" - value: str - label: Optional[str] = None - - -ICategories = Union[ - List[str], - List[CategoryDict], -] -"""Categories type used by IntegerFieldDescriptor and StringFieldDescriptor""" +from .number_descriptor import NumberFieldDescriptor +from .object_descriptor import ObjectFieldDescriptor +from .string_descriptor import StringFieldDescriptor +from .time_descriptor import TimeFieldDescriptor +from .year_descriptor import YearFieldDescriptor +from .yearmonth_descriptor import YearmonthFieldDescriptor IItemType = Literal[ @@ -102,7 +34,7 @@ class CategoryDict(BaseModel): "time", ] - +# TODO: why is this not implemented? class ListFieldDescriptor(BaseFieldDescriptor): """The field contains data that is an ordered one-level depth collection of primitive values with a fixed item type. 
@@ -125,101 +57,15 @@ class ListFieldDescriptor(BaseFieldDescriptor): """ -class NumberFieldDescriptor(BaseFieldDescriptor): - """The field contains numbers of any kind including decimals.""" - - type: Literal["number"] = "number" - format: Optional[Literal["default"]] = None - constraints: Optional[ValueConstraints[float]] = None - - decimal_char: Optional[str] = PydanticField(default=None, alias="decimalChar") - """ - String whose value is used to represent a decimal point for number fields - """ - - group_char: Optional[str] = PydanticField(default=None, alias="groupChar") - """ - String whose value is used to group digits for integer/number fields - """ - - bare_number: Optional[bool] = PydanticField(default=None, alias="bareNumber") - """ - If false leading and trailing non numbers will be removed for integer/number fields - """ - - -class ObjectFieldDescriptor(BaseFieldDescriptor): - """The field contains a valid JSON object.""" - - type: Literal["object"] = "object" - format: Optional[Literal["default"]] = None - constraints: Optional[JSONConstraints] = None - - -IStringFormat = Literal[ - "binary", - "default", - "email", - "uri", - "uuid", - # Unofficial - "wkt", -] - - -class StringFieldDescriptor(BaseFieldDescriptor): - """The field contains strings, that is, sequences of characters.""" - - type: Literal["string"] = "string" - format: Optional[IStringFormat] = None - constraints: StringConstraints = PydanticField(default_factory=StringConstraints) - - categories: Optional[ICategories] = None - """ - Property to restrict the field to a finite set of possible values - """ - - categoriesOrdered: Optional[bool] = None - """ - When categoriesOrdered is true, implementations SHOULD regard the order of - appearance of the values in the categories property as their natural order. 
- """ - - -class TimeFieldDescriptor(BaseFieldDescriptor): - """The field contains a time without a date.""" - - type: Literal["time"] = "time" - format: Optional[str] = None - constraints: Optional[ValueConstraints[datetime.time]] = None - - -class YearFieldDescriptor(BaseFieldDescriptor): - """The field contains a calendar year.""" - - type: Literal["year"] = "year" - format: Optional[Literal["default"]] = None - constraints: Optional[ValueConstraints[int]] = None - - -class YearmonthFieldDescriptor(BaseFieldDescriptor): - """The field contains a specific month of a specific year.""" - - type: Literal["yearmonth"] = "yearmonth" - format: Optional[Literal["default"]] = None - constraints: Optional[ValueConstraints[str]] = None - -FieldDescriptor = Union[ +FieldDescriptorNoArrayOrList = Union[ AnyFieldDescriptor, - ArrayFieldDescriptor, # wip - BooleanFieldDescriptor, # v - DateFieldDescriptor, # v + BooleanFieldDescriptor, + DateFieldDescriptor, DatetimeFieldDescriptor, DurationFieldDescriptor, GeoJSONFieldDescriptor, GeoPointFieldDescriptor, - IntegerFieldDescriptor, # v - ListFieldDescriptor, + IntegerFieldDescriptor, NumberFieldDescriptor, ObjectFieldDescriptor, StringFieldDescriptor, @@ -227,3 +73,10 @@ class YearmonthFieldDescriptor(BaseFieldDescriptor): YearFieldDescriptor, YearmonthFieldDescriptor, ] + +# Recursive field descriptors (reference FieldDescriptor itself) +FieldDescriptor = Union[ + FieldDescriptorNoArrayOrList, + ArrayFieldDescriptor, + ListFieldDescriptor, +] diff --git a/frictionless/fields/geojson.py b/frictionless/fields/geojson.py index 488421e2b3..12299ee23e 100644 --- a/frictionless/fields/geojson.py +++ b/frictionless/fields/geojson.py @@ -1,12 +1,7 @@ from __future__ import annotations -import json -from typing import Any, Dict, cast - import attrs -from .. 
import settings -from ..platform import platform from ..schema import Field @@ -18,53 +13,3 @@ class GeojsonField(Field): "required", "enum", ] - - # Read - - # TODO: use different value_readers based on format (see string) - def create_value_reader(self): - validator_for = platform.jsonschema_validators.validator_for # type: ignore - validators = { # type: ignore - "default": validator_for(settings.GEOJSON_PROFILE)(settings.GEOJSON_PROFILE), - "topojson": validator_for(settings.TOPOJSON_PROFILE)( - settings.TOPOJSON_PROFILE - ), - } - - # Create reader - def value_reader(cell: Any): - if isinstance(cell, str): - try: - cell = json.loads(cell) - except Exception: - return None - if not isinstance(cell, dict): - return None - if self.format in ["default", "topojson"]: - try: - validators[self.format].validate(cell) # type: ignore - except Exception: - return None - return cast(Dict[str, Any], cell) - - return value_reader - - # Write - - def create_value_writer(self): - # Create writer - def value_writer(cell: Any): - return json.dumps(cell) - - return value_writer - - # Metadata - - metadata_profile_patch = { - "properties": { - "format": { - "type": "string", - "enum": ["default", "topojson"], - }, - } - } diff --git a/frictionless/fields/geojson_descriptor.py b/frictionless/fields/geojson_descriptor.py new file mode 100644 index 0000000000..918b1a1466 --- /dev/null +++ b/frictionless/fields/geojson_descriptor.py @@ -0,0 +1,44 @@ +import json +from typing import Any, Dict, Literal, Optional, cast + +from .. 
import settings +from ..platform import platform +from .base_field_descriptor import BaseFieldDescriptor +from .field_constraints import BaseConstraints + + +class GeoJSONFieldDescriptor(BaseFieldDescriptor): + """The field contains a JSON object according to GeoJSON or TopoJSON spec.""" + + type: Literal["geojson"] = "geojson" + format: Optional[Literal["default", "topojson"]] = None + constraints: Optional[BaseConstraints[str]] = None + + def read_value(self, cell: Any) -> Optional[Dict[str, Any]]: + validator_for = platform.jsonschema_validators.validator_for # type: ignore + validators = { # type: ignore + "default": validator_for(settings.GEOJSON_PROFILE)(settings.GEOJSON_PROFILE), + "topojson": validator_for(settings.TOPOJSON_PROFILE)( + settings.TOPOJSON_PROFILE + ), + } + + if isinstance(cell, str): + try: + cell = json.loads(cell) + except Exception: + return None + if not isinstance(cell, dict): + return None + if self.format in ["default", "topojson"]: + try: + validators[self.format].validate(cell) # type: ignore + except Exception: + return None + return cast(Dict[str, Any], cell) + + def write_value(self, cell: Any) -> Optional[str]: + if cell is None: + return None + return json.dumps(cell) + diff --git a/frictionless/fields/geopoint.py b/frictionless/fields/geopoint.py index 46a1435350..794de21fce 100644 --- a/frictionless/fields/geopoint.py +++ b/frictionless/fields/geopoint.py @@ -1,9 +1,5 @@ from __future__ import annotations -import json -from decimal import Decimal -from typing import Any, NamedTuple - import attrs from ..schema import Field @@ -17,65 +13,3 @@ class GeopointField(Field): "required", "enum", ] - - # Read - - def create_value_reader(self): - # Create reader - def value_reader(cell: Any): - # Parse - if isinstance(cell, str): - try: - if self.format == "default": - lon, lat = cell.split(",") - lon = lon.strip() - lat = lat.strip() - elif self.format == "array": - lon, lat = json.loads(cell) - elif self.format == "object": - cell = 
json.loads(cell) - if len(cell) != 2: - return None - lon = cell["lon"] - lat = cell["lat"] - cell = geopoint(Decimal(lon), Decimal(lat)) # type: ignore - except Exception: - return None - - # Validate - try: - cell = geopoint(*cell) - if cell.lon > 180 or cell.lon < -180: - return None - if cell.lat > 90 or cell.lat < -90: - return None - except Exception: - return None - - return cell - - return value_reader - - # Write - - def create_value_writer(self): - # Create writer - def value_writer(cell: Any): - if self.format == "array": - return json.dumps(list(cell)) - elif self.format == "object": - return json.dumps({"lon": cell.lon, "lat": cell.lat}) - return ",".join(map(str, cell)) - - return value_writer - - -# Internal - - -class geopoint(NamedTuple): - lon: int - lat: int - - def __repr__(self): - return str([float(self[0]), float(self[1])]) diff --git a/frictionless/fields/geopoint_descriptor.py b/frictionless/fields/geopoint_descriptor.py new file mode 100644 index 0000000000..872bdb3f5d --- /dev/null +++ b/frictionless/fields/geopoint_descriptor.py @@ -0,0 +1,66 @@ +import json +from decimal import Decimal +from typing import Any, Literal, NamedTuple, Optional + +from .base_field_descriptor import BaseFieldDescriptor +from .field_constraints import BaseConstraints + + +class geopoint(NamedTuple): + """Internal representation of a geographic point""" + lon: Decimal + lat: Decimal + + def __repr__(self): + return str([float(self[0]), float(self[1])]) + + +class GeoPointFieldDescriptor(BaseFieldDescriptor): + """The field contains data describing a geographic point.""" + + type: Literal["geopoint"] = "geopoint" + format: Optional[Literal["default", "array", "object"]] = None + constraints: Optional[BaseConstraints[str]] = None + + def read_value(self, cell: Any) -> Optional[geopoint]: + # Parse + if isinstance(cell, str): + try: + if self.format == "default" or self.format is None: + lon, lat = cell.split(",") + lon = lon.strip() + lat = lat.strip() + elif 
self.format == "array": + lon, lat = json.loads(cell) + elif self.format == "object": + cell = json.loads(cell) + if len(cell) != 2: + return None + lon = cell["lon"] + lat = cell["lat"] + cell = geopoint(Decimal(lon), Decimal(lat)) # type: ignore + except Exception: + return None + + # Validate + try: + cell = geopoint(*cell) + if cell.lon > 180 or cell.lon < -180: + return None + if cell.lat > 90 or cell.lat < -90: + return None + except Exception: + return None + + return cell + + def write_value(self, cell: Any) -> Optional[str]: + if cell is None: + return None + format_value = self.format or "default" + if format_value == "array": + return json.dumps(list(cell)) + elif format_value == "object": + return json.dumps({"lon": cell.lon, "lat": cell.lat}) + return ",".join(map(str, cell)) + diff --git a/frictionless/fields/number.py b/frictionless/fields/number.py index 39b11f70c1..6af9291659 100644 --- a/frictionless/fields/number.py +++ b/frictionless/fields/number.py @@ -1,12 +1,8 @@ from __future__ import annotations -import re -from decimal import Decimal -from typing import Any - import attrs +from typing import Optional -from .. import settings from ..schema import Field @@ -20,113 +16,9 @@ class NumberField(Field): "maximum", "enum", ] + decimal_char: Optional[str] = None + group_char: Optional[str] = None + bare_number: Optional[bool] = None + float_number: Optional[bool] = None - bare_number: bool = settings.DEFAULT_BARE_NUMBER - """ - It specifies that the value is a bare number. If true, the pattern to remove non digit - character does not get applied and vice versa. The default value is True. - """ - - float_number: bool = settings.DEFAULT_FLOAT_NUMBER - """ - It specifies that the value is a float number. - """ - - decimal_char: str = settings.DEFAULT_DECIMAL_CHAR - """ - It specifies the char to be used as decimal character. The default - value is ".". It values can be: ".", "@" etc. 
- """ - - group_char: str = settings.DEFAULT_GROUP_CHAR - """ - It specifies the char to be used as group character. The default value - is "". It can take values such as: ",", "#" etc. - """ - - # Read - - def create_value_reader(self): - # Create pattern - pattern = None - if not self.bare_number: - pattern = re.compile(r"((^[^-\d]*)|(\D*$))") - - # Create processor - processor = None - properties = ["group_char", "decimal_char", "bare_number"] - if set(properties).intersection(self.list_defined()): - - def processor_function(cell: Any): - if pattern: - cell = pattern.sub("", cell) - cell = cell.replace(self.group_char, "") - if self.decimal_char != "." and "." in cell: - return None - cell = cell.replace(self.decimal_char, ".") - return cell - - processor = processor_function - - # Create reader - def value_reader(cell: Any): - Primary = Decimal - Secondary = float - if self.float_number: - Primary = float - Secondary = Decimal - if isinstance(cell, str): - cell = cell.strip() - - # Process the cell - if processor: - cell = processor(cell) # type: ignore - if cell is None: - return None - - # Cast the cell - try: - return Primary(cell) # type: ignore - except Exception: - return None - - elif isinstance(cell, Primary): - return cell - elif cell is True or cell is False: - return None - elif isinstance(cell, int): - return cell - elif isinstance(cell, Secondary): - return Primary(str(cell) if Primary is Decimal else cell) - return None - - return value_reader - - # Write - - # TODO: optimize - def create_value_writer(self): - # Create writer - def value_writer(cell: Any): - if self.has_defined("group_char"): - cell = f"{cell:,}".replace(",", "g") - else: - cell = str(cell) - if self.has_defined("decimal_char"): - cell = cell.replace(".", self.decimal_char) - if self.has_defined("group_char"): - cell = cell.replace("g", self.group_char) - return cell - - return value_writer - - # Metadata - metadata_profile_patch = { - "properties": { - "bareNumber": {"type": 
"boolean"}, - "floatNumber": {"type": "boolean"}, - "decimalChar": {"type": "string"}, - "groupChar": {"type": "string"}, - } - } diff --git a/frictionless/fields/number_descriptor.py b/frictionless/fields/number_descriptor.py new file mode 100644 index 0000000000..e91cb958a6 --- /dev/null +++ b/frictionless/fields/number_descriptor.py @@ -0,0 +1,109 @@ +import re +from decimal import Decimal +from typing import Any, Callable, Literal, Optional, Pattern, Union + +from pydantic import Field as PydanticField + +from .. import settings +from .base_field_descriptor import BaseFieldDescriptor +from .field_constraints import ValueConstraints + + +class NumberFieldDescriptor(BaseFieldDescriptor): + """The field contains numbers of any kind including decimals.""" + + type: Literal["number"] = "number" + format: Optional[Literal["default"]] = None + constraints: Optional[ValueConstraints[Union[int, float]]] = None + + decimal_char: Optional[str] = PydanticField(default=None, alias="decimalChar") + """ + String whose value is used to represent a decimal point for number fields + """ + + group_char: Optional[str] = PydanticField(default=None, alias="groupChar") + """ + String whose value is used to group digits for integer/number fields + """ + + bare_number: Optional[bool] = PydanticField(default=None, alias="bareNumber") + """ + If false leading and trailing non numbers will be removed for integer/number fields + """ + + float_number: Optional[bool] = PydanticField(default=None, alias="floatNumber") + """ + It specifies that the value is a float number. 
+ """ + + def read_value(self, cell: Any) -> Optional[Union[float, Decimal]]: + # Create pattern + pattern: Optional[Pattern[str]] = None + bare_number_value = self.bare_number if self.bare_number is not None else settings.DEFAULT_BARE_NUMBER + if not bare_number_value: + pattern = re.compile(r"((^[^-\d]*)|(\D*$))") + + # Create processor + processor: Optional[Callable[[str], Optional[str]]] = None + decimal_char_value = self.decimal_char if self.decimal_char is not None else settings.DEFAULT_DECIMAL_CHAR + group_char_value = self.group_char if self.group_char is not None else settings.DEFAULT_GROUP_CHAR + + if self.decimal_char is not None or self.group_char is not None or self.bare_number is not None: + def processor_function(cell: str) -> Optional[str]: + if pattern: + cell = pattern.sub("", cell) + cell = cell.replace(group_char_value, "") + if decimal_char_value != "." and "." in cell: + return None + cell = cell.replace(decimal_char_value, ".") + return cell + + processor = processor_function + + # Determine primary and secondary types + Primary = Decimal + Secondary = float + float_number_value = self.float_number if self.float_number is not None else settings.DEFAULT_FLOAT_NUMBER + if float_number_value: + Primary = float + Secondary = Decimal + + if isinstance(cell, str): + cell = cell.strip() + + # Process the cell + if processor: + cell = processor(cell) + if cell is None: + return None + + # Cast the cell + try: + return Primary(cell) # type: ignore + except Exception: + return None + + elif isinstance(cell, Primary): + return cell + elif cell is True or cell is False: + return None + elif isinstance(cell, int): + return cell + elif isinstance(cell, Secondary): + return Primary(str(cell) if Primary is Decimal else cell) + return None + + def write_value(self, cell: Any) -> Optional[str]: + if cell is None: + return None + + if self.group_char is not None: + cell = f"{cell:,}".replace(",", "g") + else: + cell = str(cell) + if self.decimal_char is not 
None: + cell = cell.replace(".", self.decimal_char) + if self.group_char is not None: + cell = cell.replace("g", self.group_char) + return cell + diff --git a/frictionless/fields/object.py b/frictionless/fields/object.py index a7e947ef12..43d4def9f7 100644 --- a/frictionless/fields/object.py +++ b/frictionless/fields/object.py @@ -1,8 +1,5 @@ from __future__ import annotations -import json -from typing import Any, Dict, cast - import attrs from ..schema import Field @@ -18,30 +15,3 @@ class ObjectField(Field): "maxLength", "enum", ] - - # Read - - def create_value_reader(self): - # Create reader - def value_reader(cell: Any): - if not isinstance(cell, dict): - if not isinstance(cell, str): - return None - try: - cell = json.loads(cell) - except Exception: - return None - if not isinstance(cell, dict): - return None - return cast(Dict[str, Any], cell) - - return value_reader - - # Write - - def create_value_writer(self): - # Create writer - def value_writer(cell: Any): - return json.dumps(cell) - - return value_writer diff --git a/frictionless/fields/object_descriptor.py b/frictionless/fields/object_descriptor.py new file mode 100644 index 0000000000..13a2f64a3d --- /dev/null +++ b/frictionless/fields/object_descriptor.py @@ -0,0 +1,31 @@ +import json +from typing import Any, Dict, Literal, Optional, cast + +from .base_field_descriptor import BaseFieldDescriptor +from .field_constraints import JSONConstraints + + +class ObjectFieldDescriptor(BaseFieldDescriptor): + """The field contains a valid JSON object.""" + + type: Literal["object"] = "object" + format: Optional[Literal["default"]] = None + constraints: Optional[JSONConstraints] = None + + def read_value(self, cell: Any) -> Optional[Dict[str, Any]]: + if not isinstance(cell, dict): + if not isinstance(cell, str): + return None + try: + cell = json.loads(cell) + except Exception: + return None + if not isinstance(cell, dict): + return None + return cast(Dict[str, Any], cell) + + def write_value(self, cell: Any) 
-> Optional[str]: + if cell is None: + return None + return json.dumps(cell) + diff --git a/frictionless/fields/string.py b/frictionless/fields/string.py index 3fb4eeec5e..981c6b8274 100644 --- a/frictionless/fields/string.py +++ b/frictionless/fields/string.py @@ -1,11 +1,7 @@ from __future__ import annotations -import base64 -from typing import Any - import attrs -from ..platform import platform from ..schema import Field @@ -20,97 +16,3 @@ class StringField(Field): "pattern", "enum", ] - - # Read - - def create_value_reader(self): - # Uri - if self.format == "uri": - - def value_reader(cell: Any): - if not isinstance(cell, str): - return None - uri_validator = platform.rfc3986.validators.Validator() # type: ignore - uri_validator.require_presence_of("scheme") # type: ignore - uri = platform.rfc3986.uri_reference(cell) # type: ignore - try: - uri_validator.validate(uri) # type: ignore - except platform.rfc3986.exceptions.ValidationError: # type: ignore - return None - return cell - - # Email - elif self.format == "email": - - def value_reader(cell: Any): - if not isinstance(cell, str): - return None - if not platform.validators.email(cell): # type: ignore - return None - return cell - - # Uuid - elif self.format == "uuid": - - def value_reader(cell: Any): - if not isinstance(cell, str): - return None - if not platform.validators.uuid(cell): # type: ignore - return None - return cell - - # Binary - elif self.format == "binary": - - def value_reader(cell: Any): - if not isinstance(cell, str): - return None - try: - base64.b64decode(cell) - except Exception: - return None - return cell - - # WKT - elif self.format == "wkt": - parser = platform.wkt.Parser() - - def value_reader(cell: Any): - if not isinstance(cell, str): - return None - try: - parser.parse(cell) - except Exception: - return None - return cell - - # Default - else: - - def value_reader(cell: Any): - if not isinstance(cell, str): - return None - - return cell - - return value_reader - - # Write - - def 
create_value_writer(self): - # Create writer - def value_writer(cell: Any): - return str(cell) - - return value_writer - - # Metadata - - metadata_profile_patch = { - "properties": { - "format": { - "type": "string", - "enum": ["default", "email", "uri", "binary", "uuid", "wkt"], - }, - } - } diff --git a/frictionless/fields/string_descriptor.py b/frictionless/fields/string_descriptor.py new file mode 100644 index 0000000000..f0034c8e72 --- /dev/null +++ b/frictionless/fields/string_descriptor.py @@ -0,0 +1,102 @@ +import base64 +from typing import Any, Literal, Optional, Union, List + +from pydantic import Field as PydanticField, BaseModel +from ..platform import platform +from .base_field_descriptor import BaseFieldDescriptor +from .field_constraints import StringConstraints + +class CategoryDict(BaseModel): + """Category dictionary for field categories.""" + value: str + label: Optional[str] = None + + +ICategories = Union[ + List[str], + List[CategoryDict], +] +"""Categories type used by IntegerFieldDescriptor and StringFieldDescriptor""" +class StringFieldDescriptor(BaseFieldDescriptor): + """The field contains strings, that is, sequences of characters.""" + + type: Literal["string"] = "string" + format: Optional[Literal["default", "binary", "email", "uri", "uuid", "wkt"]] = None + constraints: StringConstraints = PydanticField(default_factory=StringConstraints) + + categories: Optional[ICategories] = None + """ + Property to restrict the field to a finite set of possible values + """ + + categoriesOrdered: Optional[bool] = PydanticField(default=None, alias="categoriesOrdered") + """ + When categoriesOrdered is true, implementations SHOULD regard the order of + appearance of the values in the categories property as their natural order. 
+ """ + + def read_value(self, cell: Any) -> Optional[str]: + format_value = self.format or "default" + + # Uri + if format_value == "uri": + if not isinstance(cell, str): + return None + uri_validator = platform.rfc3986.validators.Validator() # type: ignore + uri_validator.require_presence_of("scheme") # type: ignore + uri = platform.rfc3986.uri_reference(cell) # type: ignore + try: + uri_validator.validate(uri) # type: ignore + except platform.rfc3986.exceptions.ValidationError: # type: ignore + return None + return cell + + # Email + elif format_value == "email": + if not isinstance(cell, str): + return None + result = platform.validators.email(cell) # type: ignore + if result is True: + return cell + return None + + # Uuid + elif format_value == "uuid": + if not isinstance(cell, str): + return None + if not platform.validators.uuid(cell): # type: ignore + return None + return cell + + # Binary + elif format_value == "binary": + if not isinstance(cell, str): + return None + try: + base64.b64decode(cell) + except Exception: + return None + return cell + + # WKT + elif format_value == "wkt": + parser = platform.wkt.Parser() + if not isinstance(cell, str): + return None + try: + parser.parse(cell) + except Exception: + return None + return cell + + # Default + else: + if not isinstance(cell, str): + return None + return cell + + def write_value(self, cell: Any) -> Optional[str]: + if cell is None: + return None + return str(cell) + diff --git a/frictionless/fields/time.py b/frictionless/fields/time.py index 750ef16f3a..41a5411024 100644 --- a/frictionless/fields/time.py +++ b/frictionless/fields/time.py @@ -1,12 +1,7 @@ from __future__ import annotations -from datetime import datetime, time -from typing import Any - import attrs -from .. 
import settings -from ..platform import platform from ..schema import Field @@ -20,46 +15,3 @@ class TimeField(Field): "maximum", "enum", ] - - # Read - - # TODO: use different value_readers based on format (see string) - def create_value_reader(self): - # Create reader - def value_reader(cell: Any): - if not isinstance(cell, time): - if not isinstance(cell, str): - return None - try: - if self.format == "default": - # Guard against shorter formats supported by dateutil - assert cell[5] == ":" - assert len(cell) >= 8 - cell = platform.dateutil_parser.isoparse( - f"2000-01-01T{cell}" - ).timetz() - elif self.format == "any": - cell = platform.dateutil_parser.parse(cell).timetz() - else: - cell = datetime.strptime(cell, self.format).timetz() - except Exception: - return None - return cell - - return value_reader - - # Write - - def create_value_writer(self): - # Create format - format = self.format - if format == settings.DEFAULT_FIELD_FORMAT: - format = settings.DEFAULT_TIME_PATTERN - - # Create writer - def value_writer(cell: Any): - cell = cell.strftime(format) - cell = cell.replace("+0000", "Z") - return cell - - return value_writer diff --git a/frictionless/fields/time_descriptor.py b/frictionless/fields/time_descriptor.py new file mode 100644 index 0000000000..98ecba2d51 --- /dev/null +++ b/frictionless/fields/time_descriptor.py @@ -0,0 +1,48 @@ +import datetime +from datetime import time +from typing import Any, Literal, Optional + +from .. 
import settings +from ..platform import platform +from .base_field_descriptor import BaseFieldDescriptor +from .field_constraints import ValueConstraints + + +class TimeFieldDescriptor(BaseFieldDescriptor): + """The field contains a time without a date.""" + + type: Literal["time"] = "time" + format: Optional[str] = None + constraints: Optional[ValueConstraints[time]] = None + + def read_value(self, cell: Any) -> Optional[time]: + if not isinstance(cell, time): + if not isinstance(cell, str): + return None + try: + format_value = self.format or "default" + if format_value == "default": + # Guard against shorter formats supported by dateutil + assert cell[5] == ":" + assert len(cell) >= 8 + cell = platform.dateutil_parser.isoparse( + f"2000-01-01T{cell}" + ).timetz() + elif format_value == "any": + cell = platform.dateutil_parser.parse(cell).timetz() + else: + cell = datetime.datetime.strptime(cell, format_value).timetz() + except Exception: + return None + return cell + + def write_value(self, cell: Optional[time]) -> Optional[str]: + if cell is None: + return None + format_value = self.format or "default" + if format_value == settings.DEFAULT_FIELD_FORMAT: + format_value = settings.DEFAULT_TIME_PATTERN + result = cell.strftime(format_value) + result = result.replace("+0000", "Z") + return result + diff --git a/frictionless/fields/year.py b/frictionless/fields/year.py index 25a81d4c60..e7be6260b5 100644 --- a/frictionless/fields/year.py +++ b/frictionless/fields/year.py @@ -1,7 +1,5 @@ from __future__ import annotations -from typing import Any - import attrs from ..schema import Field @@ -17,32 +15,3 @@ class YearField(Field): "maximum", "enum", ] - - # Read - - def create_value_reader(self): - # Create reader - def value_reader(cell: Any): - if not isinstance(cell, int): - if not isinstance(cell, str): - return None - if len(cell) != 4: - return None - try: - cell = int(cell) - except Exception: - return None - if cell < 0 or cell > 9999: - return None - return 
cell - - return value_reader - - # Write - - def create_value_writer(self): - # Create writer - def value_writer(cell: Any): - return str(cell) - - return value_writer diff --git a/frictionless/fields/year_descriptor.py b/frictionless/fields/year_descriptor.py new file mode 100644 index 0000000000..9c364821c1 --- /dev/null +++ b/frictionless/fields/year_descriptor.py @@ -0,0 +1,32 @@ +from typing import Any, Literal, Optional + +from .base_field_descriptor import BaseFieldDescriptor +from .field_constraints import ValueConstraints + + +class YearFieldDescriptor(BaseFieldDescriptor): + """The field contains a calendar year.""" + + type: Literal["year"] = "year" + format: Optional[Literal["default"]] = None + constraints: Optional[ValueConstraints[int]] = None + + def read_value(self, cell: Any) -> Optional[int]: + if not isinstance(cell, int): + if not isinstance(cell, str): + return None + if len(cell) != 4: + return None + try: + cell = int(cell) + except Exception: + return None + if cell < 0 or cell > 9999: + return None + return cell + + def write_value(self, cell: Optional[int]) -> Optional[str]: + if cell is None: + return None + return str(cell) + diff --git a/frictionless/fields/yearmonth.py b/frictionless/fields/yearmonth.py index 2c119e19ce..ed75965925 100644 --- a/frictionless/fields/yearmonth.py +++ b/frictionless/fields/yearmonth.py @@ -1,7 +1,5 @@ from __future__ import annotations -from typing import Any, NamedTuple - import attrs from ..schema import Field @@ -17,45 +15,3 @@ class YearmonthField(Field): "maximum", "enum", ] - - # Read - - def create_value_reader(self): - # Create reader - def value_reader(cell: Any): - if isinstance(cell, (tuple, list)): - if len(cell) != 2: # type: ignore - return None - cell = yearmonth(year=cell[0], month=cell[1]) # type: ignore - elif isinstance(cell, str): - try: - year, month = cell.split("-") - year = int(year) - month = int(month) - if month < 1 or month > 12: - return None - cell = yearmonth(year, month) - 
except Exception: - return None - else: - return None - return cell - - return value_reader - - # Write - - def create_value_writer(self): - # Create writer - def value_writer(cell: Any): - return f"{cell.year}-{cell.month:02}" - - return value_writer - - -# Internal - - -class yearmonth(NamedTuple): - year: int - month: int diff --git a/frictionless/fields/yearmonth_descriptor.py b/frictionless/fields/yearmonth_descriptor.py new file mode 100644 index 0000000000..284880b1c1 --- /dev/null +++ b/frictionless/fields/yearmonth_descriptor.py @@ -0,0 +1,43 @@ +from typing import Any, Literal, NamedTuple, Optional + +from .base_field_descriptor import BaseFieldDescriptor +from .field_constraints import ValueConstraints + + +class yearmonth(NamedTuple): + """Internal representation of a year-month""" + year: int + month: int + + +class YearmonthFieldDescriptor(BaseFieldDescriptor): + """The field contains a specific month of a specific year.""" + + type: Literal["yearmonth"] = "yearmonth" + format: Optional[Literal["default"]] = None + constraints: Optional[ValueConstraints[str]] = None + + def read_value(self, cell: Any) -> Optional[yearmonth]: + if isinstance(cell, (tuple, list)): + if len(cell) != 2: # type: ignore + return None + cell = yearmonth(year=cell[0], month=cell[1]) # type: ignore + elif isinstance(cell, str): + try: + year, month = cell.split("-") + year = int(year) + month = int(month) + if month < 1 or month > 12: + return None + cell = yearmonth(year, month) + except Exception: + return None + else: + return None + return cell + + def write_value(self, cell: Any) -> Optional[str]: + if cell is None: + return None + return f"{cell.year}-{cell.month:02}" + diff --git a/frictionless/formats/jsonschema/mapper.py b/frictionless/formats/jsonschema/mapper.py index d6d11c3846..dc8b7e2c69 100644 --- a/frictionless/formats/jsonschema/mapper.py +++ b/frictionless/formats/jsonschema/mapper.py @@ -27,14 +27,16 @@ def read_schema(self, profile: Dict[str, Any]) -> 
Schema: # type: ignore # Field assert isinstance(name, str) assert isinstance(prop, dict) - field = Field.from_descriptor({"type": type, "name": name}) - schema.add_field(field) - + field_descriptor = {"type": type, "name": name} + # Description description = prop.get("description") # type: ignore if description: assert isinstance(description, str) - field.description = description + field_descriptor["description"] = description + + field = Field.from_descriptor(field_descriptor) + schema.add_field(field) # Required if name in required: diff --git a/frictionless/schema/field.py b/frictionless/schema/field.py index 4114bedef5..80c60958b5 100644 --- a/frictionless/schema/field.py +++ b/frictionless/schema/field.py @@ -4,17 +4,35 @@ import decimal import re from functools import partial -from typing import TYPE_CHECKING, Any, Callable, ClassVar, Dict, List, Optional, Pattern +from typing import TYPE_CHECKING, Any, Callable, ClassVar, Dict, List, Optional, Pattern, Type import attrs import pydantic +from pydantic import BaseModel from .. import errors, settings from ..exception import FrictionlessException # from ..fields.boolean_descriptor import BooleanFieldDescriptor # from ..fields.date_descriptor import DateFieldDescriptor # from ..fields.integer_descriptor import IntegerFieldDescriptor -from ..fields.field_descriptor import BooleanFieldDescriptor, DateFieldDescriptor, IntegerFieldDescriptor, FieldDescriptor +from ..fields.field_descriptor import ( + AnyFieldDescriptor, + ArrayFieldDescriptor, + BooleanFieldDescriptor, + DateFieldDescriptor, + DatetimeFieldDescriptor, + DurationFieldDescriptor, + FieldDescriptor, + GeoJSONFieldDescriptor, + GeoPointFieldDescriptor, + IntegerFieldDescriptor, + NumberFieldDescriptor, + ObjectFieldDescriptor, + StringFieldDescriptor, + TimeFieldDescriptor, + YearFieldDescriptor, + YearmonthFieldDescriptor, +) from ..metadata import Metadata from ..system import system @@ -23,6 +41,40 @@ from . 
import types from .schema import Schema +# Mapping from field type to its corresponding descriptor class +TYPE_TO_DESCRIPTOR: Dict[str, Type[BaseModel]] = { + "any": AnyFieldDescriptor, + "array": ArrayFieldDescriptor, + "boolean": BooleanFieldDescriptor, + "date": DateFieldDescriptor, + "datetime": DatetimeFieldDescriptor, + "duration": DurationFieldDescriptor, + "geojson": GeoJSONFieldDescriptor, + "geopoint": GeoPointFieldDescriptor, + "integer": IntegerFieldDescriptor, + "number": NumberFieldDescriptor, + "object": ObjectFieldDescriptor, + "string": StringFieldDescriptor, + "time": TimeFieldDescriptor, + "year": YearFieldDescriptor, + "yearmonth": YearmonthFieldDescriptor, +} + +# Descriptor integration (temporary, during Field refactor) +# Used at two points: +# - Sync (runtime): when a Field attribute changes, update the pydantic _descriptor so read_cell/write_cell use up-to-date parsing logic (e.g. format="email"). +# - Init (validation): when creating _descriptor, we pass a dict using Frictionless descriptor keys (camelCase aliases) +DESCRIPTOR_INIT_ALIASES: Dict[str, str] = { + "format": "format", + "decimal_char": "decimalChar", + "group_char": "groupChar", + "bare_number": "bareNumber", + "float_number": "floatNumber", + "true_values": "trueValues", + "false_values": "falseValues", +} + +DESCRIPTOR_SYNC_ATTRS: set[str] = { *DESCRIPTOR_INIT_ALIASES.keys() } @attrs.define(kw_only=True, repr=False) class Field(Metadata): @@ -94,11 +146,83 @@ class Field(Metadata): List of supported constraints for a field. 
""" + # All optional fields for the field descriptor + decimal_char: Optional[str] = None + group_char: Optional[str] = None + bare_number: Optional[bool] = None + float_number: Optional[bool] = None + true_values: Optional[List[str]] = None + false_values: Optional[List[str]] = None + + def __attrs_post_init__(self): + self._init_descriptor_from_field() + def __setattr__(self, name: str, value: Any): # type: ignore if name == "type": note = 'Use "schema.set_field_type()" to update the type of the field' raise FrictionlessException(errors.FieldError(note=note)) - return super().__setattr__(name, value) # type: ignore + + result = super().__setattr__(name, value) # type: ignore + + self._sync_descriptor_property(name, value) + + return result + + def _sync_descriptor_property(self, name: str, value: Any) -> None: + """Keep the internal pydantic descriptor in sync with Field attribute assignments.""" + if name not in DESCRIPTOR_SYNC_ATTRS: + return + + if name == "format" and isinstance(value, str): + # Don't sync implicit default format into pydantic, so that it doesn't become "set" and get exported by "model_dump(exclude_unset=True)".
+ if not self._should_include_format(): + return + + if self._descriptor is None and hasattr(self, "type") and self.type: + self._init_descriptor_from_field() + + if self._descriptor and hasattr(self._descriptor, name): + setattr(self._descriptor, name, value) + + def _init_descriptor_from_field(self) -> None: + """Initialize _descriptor from Field properties if not already set + Use camelCase keys for descriptor init (as per Frictionless descriptor keys) + """ + if self._descriptor is not None: + return + + if not hasattr(self, "type") or not self.type: + return + + descriptor_class = TYPE_TO_DESCRIPTOR.get(self.type) + if not descriptor_class: + return + + descriptor_dict: Dict[str, Any] = { + "name": self.name, + "type": self.type, + } + + for attr, alias in DESCRIPTOR_INIT_ALIASES.items(): + if attr == "format": + if self._should_include_format(): + descriptor_dict["format"] = self.format + continue + value = getattr(self, attr, None) + if value is not None: + descriptor_dict[alias] = value + + try: + self._descriptor = descriptor_class.model_validate(descriptor_dict) # type: ignore + except pydantic.ValidationError: + self._descriptor = None + + def _should_include_format(self) -> bool: + """Whether `format` should be considered set for descriptor/init/sync purposes.""" + fmt = getattr(self, "format", None) + if not isinstance(fmt, str) or not fmt: + return False + return self.has_defined("format") or fmt != settings.DEFAULT_FIELD_FORMAT @property def required(self): @@ -159,13 +283,9 @@ def cell_reader(cell: Any): def create_value_reader(self) -> types.IValueReader: # Create reader - def value_reader(cell: Any): - if self._descriptor and isinstance(self._descriptor, BooleanFieldDescriptor): - return self._descriptor.read_value(cell) - if self._descriptor and isinstance(self._descriptor, IntegerFieldDescriptor): - return self._descriptor.read_value(cell) - if self._descriptor and isinstance(self._descriptor, DateFieldDescriptor): - return 
self._descriptor.read_value(cell) + def value_reader(cell: Any) -> Any: + if self._descriptor: + return self._descriptor.read_value(cell) # type: ignore return cell return value_reader @@ -203,14 +323,10 @@ def cell_writer(cell: Any, *, ignore_missing: bool = False): def create_value_writer(self) -> types.IValueWriter: # Create writer - def value_writer(cell: Any): - if self._descriptor and isinstance(self._descriptor, BooleanFieldDescriptor): - return self._descriptor.write_value(cell) - if self._descriptor and isinstance(self._descriptor, IntegerFieldDescriptor): - return self._descriptor.write_value(cell) - if self._descriptor and isinstance(self._descriptor, DateFieldDescriptor): - return self._descriptor.write_value(cell) - return str(cell) + def value_writer(cell: Any) -> Any: + if self._descriptor: + return self._descriptor.write_value(cell) # type: ignore + return cell return value_writer @@ -276,30 +392,23 @@ def metadata_import( with_basepath=with_basepath, ) - if field.type == "boolean": - try: - field._descriptor = BooleanFieldDescriptor.model_validate(descriptor_copy) - except pydantic.ValidationError as ve: - error = errors.SchemaError(note=str(ve)) - raise FrictionlessException(error) - elif field.type == "integer": - try: - field._descriptor = IntegerFieldDescriptor.model_validate(descriptor_copy) - except pydantic.ValidationError as ve: - error = errors.SchemaError(note=str(ve)) - raise FrictionlessException(error) - elif field.type == "date": + # Get the descriptor class for this field type + field_type = field.type + DescriptorClass = TYPE_TO_DESCRIPTOR.get(field_type) if field_type else None + + if DescriptorClass: try: - field._descriptor = DateFieldDescriptor.model_validate(descriptor_copy) + field._descriptor = DescriptorClass.model_validate(descriptor_copy) # type: ignore except pydantic.ValidationError as ve: - error = errors.SchemaError(note=str(ve)) - raise FrictionlessException(error) + # Temporary: Handle Pydantic validation errors + # 
TODO: Remove once Pydantic validation is properly integrated + handle_pydantic_error_for_import(ve) return field def to_descriptor(self, *, validate: bool = False) -> IDescriptor: if self._descriptor and isinstance( - self._descriptor, (BooleanFieldDescriptor, IntegerFieldDescriptor, DateFieldDescriptor) + self._descriptor, (AnyFieldDescriptor, BooleanFieldDescriptor, IntegerFieldDescriptor, DateFieldDescriptor, DatetimeFieldDescriptor, DurationFieldDescriptor, GeoJSONFieldDescriptor, GeoPointFieldDescriptor, NumberFieldDescriptor, ObjectFieldDescriptor, StringFieldDescriptor, TimeFieldDescriptor, YearFieldDescriptor, YearmonthFieldDescriptor) ): base_descr = super().to_descriptor(validate=validate) # Set by_alias=True to get camelCase keys used by Frictionless (bareNumber) instead of snake_case (bare_number) @@ -331,20 +440,28 @@ def metadata_validate(cls, descriptor: IDescriptor): # type: ignore # Examples example = descriptor.get("example") if example: + # Validate descriptor with Pydantic before continuing + # This catches Pydantic validation errors (e.g., invalid example values) type = descriptor.get("type") - Class = system.select_field_class(type) - - field = Class( - name=descriptor.get("name"), # type: ignore - format=descriptor.get("format", "default"), - ) - - if type == "boolean": - # 'example' value must be compared to customized 'trueValues' and 'falseValues' - if "trueValues" in descriptor.keys(): - field.true_values = descriptor["trueValues"] - if "falseValues" in descriptor.keys(): - field.false_values = descriptor["falseValues"] + DescriptorClass = TYPE_TO_DESCRIPTOR.get(type) if type else None + if DescriptorClass: + try: + DescriptorClass.model_validate(descriptor) + except pydantic.ValidationError as ve: + # Temporary: Handle Pydantic validation errors + # TODO: Remove once Pydantic validation is properly integrated + field_errors = handle_pydantic_error_for_validate(ve) + for field_error in field_errors: + yield field_error + return + + # Use 
metadata_select_class + metadata_import directly (without validation) to avoid recursion + # This properly initializes the field with all properties including + # type-specific ones like trueValues/falseValues for boolean + # We need to pass a copy of the descriptor to avoid modifying the original + Class = Field.metadata_select_class(type) + descriptor_copy = copy.deepcopy(descriptor) + field = Class.metadata_import(descriptor_copy) _, notes = field.read_cell(example) if notes is not None: note = f'example value "{example}" for field "{field.name}" is not valid' @@ -360,6 +477,72 @@ def metadata_validate(cls, descriptor: IDescriptor): # type: ignore # Internal +# Temporary Pydantic error handling functions +# TODO: Remove these once Pydantic validation is properly integrated +# These functions centralize the parsing logic to make future removal easier + +def parse_pydantic_errors(ve: pydantic.ValidationError) -> List[str]: + """Parse Pydantic validation errors into clean error messages. + + This is a temporary function to handle Pydantic ValidationError objects + and convert them to clean error messages by removing Pydantic-specific prefixes. + + Args: + ve: A Pydantic ValidationError + + Returns: + A list of cleaned error messages (with "Value error, " prefix removed) + """ + error_messages: List[str] = [] + for err in ve.errors(): + if "msg" in err: + note: str = str(err["msg"]) + # Remove "Value error, " prefix if present (Pydantic-specific formatting) + note = note.replace("Value error, ", "") + error_messages.append(note) + return error_messages + + +def handle_pydantic_error_for_import(ve: pydantic.ValidationError) -> None: + """Handle Pydantic ValidationError in metadata_import context. + + This is a temporary function that converts Pydantic validation errors + into Frictionless SchemaError exceptions for use during field import. 
+ + Args: + ve: A Pydantic ValidationError + + Raises: + FrictionlessException with a SchemaError containing the first error message + """ + error_messages = parse_pydantic_errors(ve) + + # Use the first error message, or fall back to string representation + if error_messages: + error_note = error_messages[0] + else: + error_note = str(ve) + + error = errors.SchemaError(note=error_note) + raise FrictionlessException(error) + + +def handle_pydantic_error_for_validate(ve: pydantic.ValidationError) -> List[errors.FieldError]: + """Handle Pydantic ValidationError in metadata_validate context. + + This is a temporary function that converts Pydantic validation errors + into Frictionless FieldError objects for use during field validation. + + Args: + ve: A Pydantic ValidationError + + Returns: + A list of FieldError objects, one for each error message + """ + error_messages = parse_pydantic_errors(ve) + return [errors.FieldError(note=note) for note in error_messages] + + def check_required(constraint: bool, cell: Any): if not (constraint and cell is None): return True