# Source code for spss_converter.Metadata

from enum import Enum
from typing import Union

from validator_collection import validators, checkers
from spss_converter import errors
from pyreadstat._readstat_parser import metadata_container


class VariableAlignmentEnum(str, Enum):
    """Closed set of alignment values accepted for a column/variable.

    Members also subclass :class:`str <python:str>`, so each member compares
    equal to its lower-case string value (e.g. ``'left'``).
    """
    # Used by ColumnMetadata as the initial alignment.
    UNKNOWN = 'unknown'
    LEFT = 'left'
    CENTER = 'center'
    RIGHT = 'right'


class VariableMeasureEnum(str, Enum):
    """Closed set of measure classifications accepted for a column/variable.

    Members also subclass :class:`str <python:str>`, so each member compares
    equal to its lower-case string value (e.g. ``'nominal'``).
    """
    # Used by ColumnMetadata as the initial measure.
    UNKNOWN = 'unknown'
    NOMINAL = 'nominal'
    ORDINAL = 'ordinal'
    SCALE = 'scale'


class ColumnMetadata(object):
    """Object representation of the :term:`metadata <Metadata>` that describes
    a column or variable from an SPSS file.

    Every keyword argument given to the constructor is assigned to the
    attribute of the same name, so values for the properties defined below
    pass through the corresponding setter (and its validation).
    """

    def __init__(self, **kwargs):
        self._name = None
        self._label = None
        self._value_metadata = None
        self._missing_range_metadata = None
        self._missing_value_metadata = None
        self._alignment = VariableAlignmentEnum.UNKNOWN
        self._measure = VariableMeasureEnum.UNKNOWN
        # Both widths default to 0, as documented on the properties. A ``None``
        # default cannot be fed back through the setters (they reject empty
        # values), which broke ``from_dict`` whenever a width key was absent.
        self._display_width = 0
        self._storage_width = 0

        for key, value in kwargs.items():
            setattr(self, key, value)

    @property
    def name(self):
        """The name of the column/variable.

        :rtype: :class:`str <python:str>` / :obj:`None <python:None>`
        """
        return self._name

    @name.setter
    def name(self, value):
        self._name = validators.variable_name(value, allow_empty=True)

    @property
    def label(self):
        """The label applied to the column/variable.

        :rtype: :class:`str <python:str>` / :obj:`None <python:None>`
        """
        return self._label

    @label.setter
    def label(self, value):
        self._label = validators.string(value, allow_empty=True)

    @property
    def alignment(self):
        """The alignment to apply to values from this column/variable when
        displaying data. Defaults to ``'unknown'``.

        Accepts either ``'unknown'``, ``'left'``, ``'center'``, or ``'right'``
        as either a case-insensitive :class:`str <python:str>` or a
        :class:`VariableAlignmentEnum`.

        .. note::

          The initial value is the :class:`VariableAlignmentEnum` member
          ``UNKNOWN``; values assigned afterwards are stored as the validated
          lower-case :class:`str <python:str>`. Enum members compare equal to
          their string value.

        :rtype: :class:`VariableAlignmentEnum` / :class:`str <python:str>`
        """
        return self._alignment

    @alignment.setter
    def alignment(self, value):
        value = validators.string(value, allow_empty=False)
        value = value.lower()
        if value not in {member.value for member in VariableAlignmentEnum}:
            raise ValueError(f'value ({value}) is not a recognized alignment')

        self._alignment = value

    @property
    def measure(self):
        """A classification of the type of measure (or value type) represented
        by the variable. Defaults to ``'unknown'``.

        Accepts either ``'unknown'``, ``'nominal'``, ``'ordinal'``, or
        ``'scale'``.

        .. note::

          The initial value is the :class:`VariableMeasureEnum` member
          ``UNKNOWN``; values assigned afterwards are stored as the validated
          lower-case :class:`str <python:str>`.

        :rtype: :class:`VariableMeasureEnum` / :class:`str <python:str>`
        """
        return self._measure

    @measure.setter
    def measure(self, value):
        value = validators.string(value, allow_empty=False)
        value = value.lower()
        if value not in {member.value for member in VariableMeasureEnum}:
            raise ValueError(f'value ({value}) is not a recognized measure')

        self._measure = value

    @property
    def display_width(self):
        """The maximum width at which the value is displayed. Defaults to 0.

        :rtype: :class:`int <python:int>`
        """
        return self._display_width

    @display_width.setter
    def display_width(self, value):
        self._display_width = validators.integer(value,
                                                 allow_empty=False,
                                                 minimum=0,
                                                 coerce_value=True)

    @property
    def storage_width(self):
        """The width of data to store in the data file for the value.
        Defaults to 0.

        :rtype: :class:`int <python:int>`
        """
        return self._storage_width

    @storage_width.setter
    def storage_width(self, value):
        self._storage_width = validators.integer(value,
                                                 allow_empty=False,
                                                 minimum=0,
                                                 coerce_value=True)

    @property
    def value_metadata(self):
        """Collection of values possible for the column/variable, with
        corresponding labels for each value.

        :returns: :class:`dict <python:dict>` whose keys are the values in the
          raw data and whose values are the labels for each value. May be
          :obj:`None <python:None>` for variables whose value is not coded.
        :rtype: :class:`dict <python:dict>` / :obj:`None <python:None>`
        """
        return self._value_metadata

    @value_metadata.setter
    def value_metadata(self, value):
        value = validators.dict(value, allow_empty=True)
        if not value:
            self._value_metadata = None
            return

        self._value_metadata = {
            key: validators.string(label, allow_empty=True)
            for key, label in value.items()
        }

    @property
    def missing_range_metadata(self):
        """Collection of metadata that defines the numerical ranges that are
        to be considered missing in the underlying data.

        :returns: :class:`list <python:list>` of :class:`dict <python:dict>`
          with keys ``'low'`` and ``'high'`` for the low/high values of the
          range to apply when raw values are missing
          (:obj:`None <python:None>`).
        :rtype: :class:`list <python:list>` of :class:`dict <python:dict>` or
          :obj:`None <python:None>`

        :raises ValueError: if an item lacks a ``'high'`` or ``'low'`` key
        """
        return self._missing_range_metadata

    @missing_range_metadata.setter
    def missing_range_metadata(self, value):
        value = validators.iterable(value, allow_empty=True)
        if not value:
            self._missing_range_metadata = None
            return

        validated_ranges = []
        # Validate every item as a dict before inspecting any boundaries, so
        # a malformed item is reported ahead of a missing boundary.
        for item in [validators.dict(x, allow_empty=False) for x in value]:
            if 'high' not in item or 'low' not in item:
                raise ValueError('missing_range_metadata requires a "high" and "low"'
                                 ' boundary to be defined.')

            validated_ranges.append({
                'high': validators.numeric(item.get('high'), allow_empty=False),
                'low': validators.numeric(item.get('low'), allow_empty=False),
            })

        self._missing_range_metadata = validated_ranges

    @property
    def missing_value_metadata(self):
        """Value used to represent missing values in the raw data. Defaults to
        :obj:`None <python:None>`.

        .. note::

          This is not actually relevant for SPSS data, but is an artifact for
          SAS and Stata data.

        :rtype: :class:`list <python:list>` of :class:`int <python:int>` or
          :class:`str <python:str>` / :obj:`None <python:None>`
        """
        return self._missing_value_metadata

    @missing_value_metadata.setter
    def missing_value_metadata(self, value):
        if not value:
            self._missing_value_metadata = None
            return

        # A lone string or number is treated as a one-item collection.
        if checkers.is_string(value) or checkers.is_numeric(value):
            value = [value]

        validated_values = []
        for item in value:
            try:
                validated_item = validators.string(item, allow_empty=False)
            except (ValueError, TypeError):
                # Not a usable string: fall back to integer validation. The
                # validator is ``validators.integer``; ``validators.int`` does
                # not exist in validator-collection.
                validated_item = validators.integer(item, allow_empty=False)

            validated_values.append(validated_item)

        self._missing_value_metadata = validated_values

    @classmethod
    def from_dict(cls, as_dict: dict):
        """Create a new :class:`ColumnMetadata` instance from a
        :class:`dict <python:dict>` representation.

        Keys absent from ``as_dict`` keep the instance's default value.
        ``as_dict`` itself is left unmodified.

        :param as_dict: The :class:`dict <python:dict>` representation of the
          :class:`ColumnMetadata`.
        :type as_dict: :class:`dict <python:dict>`

        :returns: The :class:`ColumnMetadata` instance.
        :rtype: :class:`ColumnMetadata`
        """
        instance = cls()
        # ``get`` rather than ``pop``: building an instance must not strip
        # keys from the caller's dictionary.
        instance.name = as_dict.get('name', instance.name)
        instance.label = as_dict.get('label', instance.label)
        instance.alignment = as_dict.get('alignment', instance.alignment)
        instance.measure = as_dict.get('measure', instance.measure)
        instance.display_width = as_dict.get('display_width',
                                             instance.display_width)
        instance.storage_width = as_dict.get('storage_width',
                                             instance.storage_width)
        instance.value_metadata = as_dict.get('value_metadata',
                                              instance.value_metadata)
        instance.missing_range_metadata = as_dict.get('missing_range_metadata',
                                                      instance.missing_range_metadata)
        instance.missing_value_metadata = as_dict.get('missing_value_metadata',
                                                      instance.missing_value_metadata)

        return instance

    @classmethod
    def from_pyreadstat_metadata(cls, name: str, as_metadata):
        """Create a new :class:`ColumnMetadata` instance from a
        `Pyreadstat <https://github.com/Roche/pyreadstat/>`_ metadata object.

        :param name: The name of the variable for which a
          :class:`ColumnMetadata` instance should be created.
        :type name: :class:`str <python:str>`

        :param as_metadata: The
          `Pyreadstat <https://github.com/Roche/pyreadstat/>`_ metadata object
          from which the column's metadata should be extracted.
        :type as_metadata:
          :class:`Pyreadstat.metadata_container <pyreadstat:_readstat_parser.metadata_container>`

        :returns: The :class:`ColumnMetadata` instance.
        :rtype: :class:`ColumnMetadata`

        :raises ColumnNameNotFoundError: if ``name`` is not listed in
          ``as_metadata.column_names``
        """
        name = validators.variable_name(name, allow_empty=False)
        if name not in as_metadata.column_names:
            raise errors.ColumnNameNotFoundError(f'column name ({name}) not found in '
                                                 'as_metadata')

        instance = cls(name=name)
        instance.label = as_metadata.column_names_to_labels.get(name, None)
        instance.alignment = as_metadata.variable_alignment.get(name,
                                                                instance.alignment)
        instance.measure = as_metadata.variable_measure.get(name,
                                                            instance.measure)
        instance.display_width = as_metadata.variable_display_width.get(name,
                                                                        instance.display_width)
        instance.storage_width = as_metadata.variable_storage_width.get(name,
                                                                        instance.storage_width)
        instance.value_metadata = as_metadata.variable_value_labels.get(name,
                                                                        instance.value_metadata)

        # The source object keys its range boundaries 'lo'/'hi'; this class
        # uses 'low'/'high'.
        missing_ranges = as_metadata.missing_ranges.get(name, [])
        instance.missing_range_metadata = [
            {'low': x.get('lo'), 'high': x.get('hi')} for x in missing_ranges
        ]
        instance.missing_value_metadata = as_metadata.missing_user_values.get(name,
                                                                              None)

        return instance

    def to_dict(self) -> dict:
        """Generate a :class:`dict <python:dict>` representation of the
        instance.

        :rtype: :class:`dict <python:dict>`
        """
        return {
            'name': self.name,
            'label': self.label,
            'alignment': self.alignment,
            'measure': self.measure,
            'display_width': self.display_width,
            'storage_width': self.storage_width,
            'value_metadata': self.value_metadata,
            'missing_range_metadata': self.missing_range_metadata,
            'missing_value_metadata': self.missing_value_metadata,
        }

    def add_to_pyreadstat(self, pyreadstat):
        """Update ``pyreadstat`` to include the metadata for this
        column/variable.

        :param pyreadstat: The
          `Pyreadstat <https://github.com/Roche/pyreadstat/>`_ metadata object
          where the :class:`ColumnMetadata` data should be updated.
        :type pyreadstat:
          :class:`metadata_container <pyreadstat:_readstat_parser.metadata_container>`

        :returns: The `Pyreadstat <https://github.com/Roche/pyreadstat/>`_
          metadata (the same object that was passed in, updated in place).
        :rtype:
          :class:`metadata_container <pyreadstat:_readstat_parser.metadata_container>`
        """
        name = self.name
        label = self.label

        pyreadstat.column_names_to_labels[name] = label
        # ``str()`` on a ``(str, Enum)`` member yields 'ClassName.MEMBER', not
        # the underlying value, so unwrap ``.value`` when the attribute still
        # holds its default enum member.
        pyreadstat.variable_alignment[name] = str(
            getattr(self.alignment, 'value', self.alignment)
        )
        pyreadstat.variable_measure[name] = str(
            getattr(self.measure, 'value', self.measure)
        )
        pyreadstat.variable_display_width[name] = self.display_width
        pyreadstat.variable_storage_width[name] = self.storage_width

        if self.value_metadata is not None:
            pyreadstat.variable_value_labels[name] = self.value_metadata
            # NOTE(review): the value-label set is keyed by the column *label*,
            # so two columns sharing a label (or both lacking one) overwrite
            # each other's entry here -- confirm this is intended.
            pyreadstat.value_labels[label] = self.value_metadata
            pyreadstat.variable_to_label[name] = label

        if self.missing_range_metadata:
            pyreadstat.missing_ranges[name] = [
                {'lo': x.get('low'), 'hi': x.get('high')}
                for x in self.missing_range_metadata
            ]

        if self.missing_value_metadata:
            # Copy so the target does not share this instance's list.
            pyreadstat.missing_user_values[name] = list(self.missing_value_metadata)

        if name not in pyreadstat.column_names:
            pyreadstat.column_names.append(name)
            pyreadstat.column_labels.append(label)
        else:
            # Column already registered: only its label can differ.
            index = pyreadstat.column_names.index(name)
            pyreadstat.column_labels[index] = label

        return pyreadstat
class Metadata(object):
    """Object representation of :term:`metadata <Metadata>` retrieved from an
    SPSS file.

    Every keyword argument given to the constructor is assigned to the
    attribute of the same name, so values for the properties defined below
    pass through the corresponding setter (and its validation).
    """

    def __init__(self, **kwargs):
        self._column_metadata = None
        self._notes = None
        self._file_encoding = None
        self._rows = 0
        self._table_name = None
        self._file_label = None

        for key, value in kwargs.items():
            setattr(self, key, value)

    @property
    def column_metadata(self):
        """Collection of metadata that describes each column or variable within
        the dataset.

        :returns: A :class:`dict <python:dict>` where the key is the name of
          the column/variable and the value is a :class:`ColumnMetadata`
          object. When setting, values may also be compatible
          :class:`dict <python:dict>` objects, which are converted with
          :meth:`ColumnMetadata.from_dict`.
        :rtype: :class:`dict <python:dict>` / :obj:`None <python:None>`
        """
        return self._column_metadata

    @column_metadata.setter
    def column_metadata(self, value):
        value = validators.dict(value, allow_empty=True)
        if not value:
            self._column_metadata = None
            return

        result = {}
        for key, column in value.items():
            name = validators.variable_name(key, allow_empty=False)
            if checkers.is_type(column, 'ColumnMetadata'):
                result[name] = column
            else:
                # Convert from the *input* entry; reading it back out of
                # ``result`` (still empty for this key) raised ``KeyError``.
                result[name] = ColumnMetadata.from_dict(column)

        self._column_metadata = result

    @property
    def file_encoding(self) -> Union[str, None]:
        """The file encoding for the dataset.

        :rtype: :class:`str <python:str>` or :obj:`None <python:None>`
        """
        return self._file_encoding

    @file_encoding.setter
    def file_encoding(self, value):
        self._file_encoding = validators.string(value, allow_empty=True)

    @property
    def notes(self) -> Union[str, None]:
        """Set of notes related to the file.

        When set from an iterable of strings, the items are joined with
        newline characters into a single string.

        :rtype: :class:`str <python:str>` / :obj:`None <python:None>`
        """
        return self._notes

    @notes.setter
    def notes(self, value):
        if checkers.is_iterable(value):
            value = '\n'.join(value)

        self._notes = validators.string(value, allow_empty=True)

    @property
    def table_name(self) -> Union[str, None]:
        """The name of the data table.

        :rtype: :class:`str <python:str>` / :obj:`None <python:None>`
        """
        return self._table_name

    @table_name.setter
    def table_name(self, value):
        self._table_name = validators.variable_name(value, allow_empty=True)

    @property
    def file_label(self) -> Union[str, None]:
        """The file label.

        .. note::

          This property is irrelevant for SPSS, but is relevant for SAS data.

        :rtype: :class:`str <python:str>` / :obj:`None <python:None>`
        """
        return self._file_label

    @file_label.setter
    def file_label(self, value):
        self._file_label = validators.string(value, allow_empty=True)

    @property
    def columns(self):
        """The number of columns/variables in the dataset.

        :rtype: :class:`int <python:int>`
        """
        if not self.column_metadata:
            return 0

        return len(self.column_metadata)

    @property
    def rows(self):
        """The number of cases or rows in the dataset.

        :rtype: :class:`int <python:int>`
        """
        return self._rows

    @rows.setter
    def rows(self, value):
        self._rows = validators.integer(value,
                                        allow_empty=False,
                                        coerce_value=False,
                                        minimum=0)

    @classmethod
    def from_dict(cls, as_dict: dict):
        """Create a :class:`Metadata` instance from a
        :class:`dict <python:dict>` representation.

        Keys absent from ``as_dict`` keep the instance's default value.

        :param as_dict: A :class:`dict <python:dict>` representation of the
          :class:`Metadata`.
        :type as_dict: :class:`dict <python:dict>`

        :returns: A :class:`Metadata` instance
        :rtype: :class:`Metadata`
        """
        instance = cls()
        instance.notes = as_dict.get('notes', instance.notes)
        instance.table_name = as_dict.get('table_name', instance.table_name)
        instance.file_label = as_dict.get('file_label', instance.file_label)
        instance.rows = as_dict.get('rows', instance.rows)
        instance.file_encoding = as_dict.get('file_encoding',
                                             instance.file_encoding)
        instance.column_metadata = as_dict.get('column_metadata',
                                               instance.column_metadata)

        return instance

    def to_dict(self) -> dict:
        """Return a :class:`dict <python:dict>` representation of the instance.

        The ``'column_metadata'`` entry is the stored mapping itself (its
        values are :class:`ColumnMetadata` objects, not nested dictionaries).

        :rtype: :class:`dict <python:dict>`
        """
        return {
            'table_name': self.table_name,
            'file_label': self.file_label,
            'file_encoding': self.file_encoding,
            'columns': self.columns,
            'rows': self.rows,
            'column_metadata': self.column_metadata,
            'notes': self.notes,
        }

    @classmethod
    def from_pyreadstat(cls, as_metadata):
        """Create a :class:`Metadata` instance from a
        `Pyreadstat <https://github.com/Roche/pyreadstat/>`_ metadata object.

        :param as_metadata: The
          `Pyreadstat <https://github.com/Roche/pyreadstat/>`_ metadata object
          from which the :class:`Metadata` instance should be created.
        :type as_metadata:
          :class:`Pyreadstat.metadata_container <pyreadstat:_readstat_parser.metadata_container>`

        :returns: The :class:`Metadata` instance.
        :rtype: :class:`Metadata`
        """
        instance = cls()
        instance.notes = as_metadata.notes
        instance.table_name = as_metadata.table_name
        instance.file_label = as_metadata.file_label
        instance.rows = as_metadata.number_rows
        instance.file_encoding = as_metadata.file_encoding

        # Build the per-column metadata once (it was previously constructed
        # twice, with the first result discarded).
        instance.column_metadata = {
            x: ColumnMetadata.from_pyreadstat_metadata(x, as_metadata)
            for x in as_metadata.column_names
        }

        return instance

    def to_pyreadstat(self):
        """Create a `Pyreadstat <https://github.com/Roche/pyreadstat/>`_
        metadata representation of the :class:`Metadata` instance.

        Multi-line notes are emitted as a :class:`list <python:list>` with one
        item per line; single-line or empty notes are passed through
        unchanged.

        :returns: The `Pyreadstat <https://github.com/Roche/pyreadstat/>`_
          metadata.
        :rtype:
          :class:`metadata_container <pyreadstat:_readstat_parser.metadata_container>`
        """
        as_metadata = metadata_container()
        as_metadata.table_name = self.table_name
        as_metadata.file_label = self.file_label
        as_metadata.file_encoding = self.file_encoding

        notes = self.notes
        if notes and '\n' in notes:
            # ``self.notes`` is one newline-joined string; indexing it with
            # ``[0]`` yielded only its first character. Split it back into
            # the individual notes instead.
            notes = notes.split('\n')
        as_metadata.notes = notes

        # ``from_pyreadstat`` reads the row count from ``number_rows``, so
        # write it there for a faithful round trip. The ``rows`` attribute is
        # still set for callers relying on the previous behaviour.
        as_metadata.number_rows = self.rows
        as_metadata.number_columns = self.columns
        as_metadata.rows = self.rows

        # ``column_metadata`` may be ``None`` when no columns were defined.
        for column in (self.column_metadata or {}).values():
            as_metadata = column.add_to_pyreadstat(as_metadata)

        return as_metadata