# Source code for spss_converter.read

from typing import Union, List, Optional
import tempfile

import os
from io import BytesIO, StringIO
import simplejson as json
import yaml
import pyreadstat
from validator_collection import validators, checkers
from spss_converter import errors
from spss_converter.Metadata import Metadata
from pandas import ExcelWriter


def _read_spss(data: Union[bytes, BytesIO, 'os.PathLike[Any]'],
               limit: Optional[int] = None,
               offset: int = 0,
               exclude_variables: Optional[List[str]] = None,
               include_variables: Optional[List[str]] = None,
               metadata_only: bool = False,
               apply_labels: bool = False,
               labels_as_categories: bool = True,
               missing_as_NaN: bool = False,
               convert_datetimes: bool = True,
               dates_as_datetime64: bool = False,
               **kwargs):
    """Internal function that reads an SPSS (.sav or .zsav) file and returns a
    :class:`tuple <python:tuple>` with a Pandas
    :class:`DataFrame <pandas:pandas.DataFrame>` object and a :class:`Metadata`
    instance.

    In-memory input (:class:`bytes <python:bytes>` or
    :class:`BytesIO <python:io.BytesIO>`) is first written to a temporary file, because
    the data is handed to ``pyreadstat.read_sav()`` by filename. The temporary file is
    always removed afterwards, even if reading fails.

    :param data: The SPSS data to load. Accepts either a series of bytes or a filename.
    :type data: Path-like filename, :class:`bytes <python:bytes>` or
      :class:`BytesIO <python:io.BytesIO>`

    :param limit: The number of records to read from the data. If :obj:`None <python:None>`
      will return all records. Defaults to :obj:`None <python:None>`.
    :type limit: :class:`int <python:int>` or :obj:`None <python:None>`

    :param offset: The record at which to start reading the data. Defaults to 0 (first
      record).
    :type offset: :class:`int <python:int>`

    :param exclude_variables: A list of the variables that should be ignored when reading
      data. The columns are dropped from the
      :class:`DataFrame <pandas:pandas.DataFrame>` (and from the column metadata) after
      the data has been read. Defaults to :obj:`None <python:None>`.
    :type exclude_variables: iterable of :class:`str <python:str>` or
      :obj:`None <python:None>`

    :param include_variables: A list of the variables that should be explicitly included
      when reading data. Defaults to :obj:`None <python:None>`.
    :type include_variables: iterable of :class:`str <python:str>` or
      :obj:`None <python:None>`

    :param metadata_only: If ``True``, will return no data records in the resulting
      :class:`DataFrame <pandas:pandas.DataFrame>` but will return complete
      :class:`Metadata`. Defaults to ``False``.
    :type metadata_only: :class:`bool <python:bool>`

    :param apply_labels: If ``True``, converts the numerically-coded values in the raw
      data to their human-readable labels. Defaults to ``False``.
    :type apply_labels: :class:`bool <python:bool>`

    :param labels_as_categories: If ``True``, will convert labeled or formatted values to
      Pandas :term:`categories <pandas:category>`. Defaults to ``True``.

      .. caution::

        This parameter will only have an effect if the ``apply_labels`` parameter is
        ``True``.

    :type labels_as_categories: :class:`bool <python:bool>`

    :param missing_as_NaN: If ``True``, will return any missing values as
      :class:`NaN <pandas:NaN>`. Otherwise will return missing values as per the
      configuration of missing value representation stored in the underlying SPSS data.
      Defaults to ``False``, which applies the missing value representation configured in
      the SPSS data itself.
    :type missing_as_NaN: :class:`bool <python:bool>`

    :param convert_datetimes: if ``True``, will convert the native integer representation
      of datetime values in the SPSS data to Pythonic
      :class:`datetime <python:datetime.datetime>`, or
      :class:`date <python:datetime.date>`, etc. representations (or Pandas
      :class:`datetime64 <pandas:datetime64>`, depending on the ``dates_as_datetime64``
      parameter). If ``False``, will leave the original integer representation. Defaults
      to ``True``.
    :type convert_datetimes: :class:`bool <python:bool>`

    :param dates_as_datetime64: If ``True``, will return any date values as Pandas
      :class:`datetime64 <pandas.datetime64>` types. Defaults to ``False``.

      .. caution::

        This parameter is only applied if ``convert_datetimes`` is set to ``True``.

    :type dates_as_datetime64: :class:`bool <python:bool>`

    :param kwargs: Any additional keyword arguments, forwarded unchanged to
      ``pyreadstat.read_sav()``.

    :returns: A :class:`DataFrame <pandas:DataFrame>` representation of the SPSS data (or
      :obj:`None <python:None>`) and a :class:`Metadata` representation of the dataset's
      metadata / data map.
    :rtype: :class:`pandas.DataFrame <pandas:DataFrame>`/:obj:`None <python:None>` and
      :class:`Metadata`

    :raises InvalidDataFormatError: if ``data`` is not an existing file, a
      :class:`BytesIO <python:io.BytesIO>`, or a :class:`bytes <python:bytes>` object

    """
    if not any([checkers.is_file(data),
                checkers.is_bytesIO(data),
                checkers.is_type(data, bytes)]):
        raise errors.InvalidDataFormatError('data must be a filename, BytesIO, or bytes '
                                            f'object. Was: {data.__class__.__name__}')

    limit = validators.integer(limit, allow_empty = True, minimum = 0)
    offset = validators.integer(offset, minimum = 0)

    exclude_variables = validators.iterable(exclude_variables, allow_empty = True)
    if exclude_variables:
        exclude_variables = [validators.string(x) for x in exclude_variables]

    include_variables = validators.iterable(include_variables, allow_empty = True)
    if include_variables:
        include_variables = [validators.string(x) for x in include_variables]

    read_options = {
        'metadataonly': metadata_only,
        'dates_as_pandas_datetime': dates_as_datetime64,
        'apply_value_formats': apply_labels,
        'formats_as_category': labels_as_categories,
        'usecols': include_variables,
        'user_missing': not missing_as_NaN,
        'disable_datetime_conversion': not convert_datetimes,
        # A limit of None (or 0) is passed on as 0.
        'row_limit': limit or 0,
        'row_offset': offset,
    }

    temp_file_name = None
    try:
        if checkers.is_file(data):
            source = data
        else:
            # A BytesIO object cannot be passed to ``write()`` directly; extract its
            # complete contents (independent of the current stream position).
            raw_bytes = data.getvalue() if checkers.is_bytesIO(data) else data

            # delete = False so the file can be closed and then re-opened by name.
            with tempfile.NamedTemporaryFile(delete = False) as temp_file:
                temp_file_name = temp_file.name
                temp_file.write(raw_bytes)

            source = temp_file_name

        # Passing both mappings keeps the original behavior of raising a TypeError
        # when ``kwargs`` duplicates one of the explicitly-set options.
        df, meta = pyreadstat.read_sav(source, **read_options, **kwargs)
    finally:
        # Clean up the temporary file whether or not reading succeeded.
        if temp_file_name is not None:
            os.remove(temp_file_name)

    metadata = Metadata.from_pyreadstat(meta)

    if exclude_variables:
        df = df.drop(exclude_variables, axis = 1)
        if metadata.column_metadata:
            for variable in exclude_variables:
                metadata.column_metadata.pop(variable, None)

    return df, metadata


def get_metadata(data):
    """Return only the metadata of the SPSS ``data``: the description of the coded
    representation of the data, the corresponding formatting information, and the
    related human-readable labels.

    :param data: The SPSS data to load. Accepts either a series of bytes or a filename.
    :type data: Path-like filename, :class:`bytes <python:bytes>` or
      :class:`BytesIO <python:io.BytesIO>`

    :returns: The metadata that describes the raw data and its corresponding labels.
    :rtype: :class:`Metadata`

    """
    # The data frame half of the result is not needed here.
    _, metadata = _read_spss(data, metadata_only = True)

    return metadata
def to_dataframe(data: Union[bytes, BytesIO, 'os.PathLike[Any]'],
                 limit: Optional[int] = None,
                 offset: int = 0,
                 exclude_variables: Optional[List[str]] = None,
                 include_variables: Optional[List[str]] = None,
                 metadata_only: bool = False,
                 apply_labels: bool = False,
                 labels_as_categories: bool = True,
                 missing_as_NaN: bool = False,
                 convert_datetimes: bool = True,
                 dates_as_datetime64: bool = False,
                 **kwargs):
    """Read SPSS data and return a :class:`tuple <python:tuple>` holding a Pandas
    :class:`DataFrame <pandas:pandas.DataFrame>` and the relevant :class:`Metadata`.

    This is the public counterpart of ``_read_spss()``: every argument is forwarded to
    it unchanged.

    :param data: The SPSS data to load. Accepts either a series of bytes or a filename.
    :type data: Path-like filename, :class:`bytes <python:bytes>` or
      :class:`BytesIO <python:io.BytesIO>`

    :param limit: The number of records to read from the data. If :obj:`None <python:None>`
      will return all records. Defaults to :obj:`None <python:None>`.
    :type limit: :class:`int <python:int>` or :obj:`None <python:None>`

    :param offset: The record at which to start reading the data. Defaults to 0 (first
      record).
    :type offset: :class:`int <python:int>`

    :param exclude_variables: A list of the variables that should be ignored when reading
      data. Defaults to :obj:`None <python:None>`.
    :type exclude_variables: iterable of :class:`str <python:str>` or
      :obj:`None <python:None>`

    :param include_variables: A list of the variables that should be explicitly included
      when reading data. Defaults to :obj:`None <python:None>`.
    :type include_variables: iterable of :class:`str <python:str>` or
      :obj:`None <python:None>`

    :param metadata_only: If ``True``, will return no data records in the resulting
      :class:`DataFrame <pandas:pandas.DataFrame>` but will return a complete
      :class:`Metadata` instance. Defaults to ``False``.
    :type metadata_only: :class:`bool <python:bool>`

    :param apply_labels: If ``True``, converts the numerically-coded values in the raw
      data to their human-readable labels. Defaults to ``False``.
    :type apply_labels: :class:`bool <python:bool>`

    :param labels_as_categories: If ``True``, will convert labeled or formatted values to
      Pandas :term:`categories <pandas:category>`. Defaults to ``True``.

      .. caution::

        This parameter will only have an effect if the ``apply_labels`` parameter is
        ``True``.

    :type labels_as_categories: :class:`bool <python:bool>`

    :param missing_as_NaN: If ``True``, will return any missing values as
      :class:`NaN <pandas:NaN>`. Otherwise will return missing values as per the
      configuration of missing value representation stored in the underlying SPSS data.
      Defaults to ``False``, which applies the missing value representation configured in
      the SPSS data itself.
    :type missing_as_NaN: :class:`bool <python:bool>`

    :param convert_datetimes: if ``True``, will convert the native integer representation
      of datetime values in the SPSS data to Pythonic
      :class:`datetime <python:datetime.datetime>`, or
      :class:`date <python:datetime.date>`, etc. representations (or Pandas
      :class:`datetime64 <pandas:datetime64>`, depending on the ``dates_as_datetime64``
      parameter). If ``False``, will leave the original integer representation. Defaults
      to ``True``.
    :type convert_datetimes: :class:`bool <python:bool>`

    :param dates_as_datetime64: If ``True``, will return any date values as Pandas
      :class:`datetime64 <pandas.datetime64>` types. Defaults to ``False``.

      .. caution::

        This parameter is only applied if ``convert_datetimes`` is set to ``True``.

    :type dates_as_datetime64: :class:`bool <python:bool>`

    :returns: A :class:`DataFrame <pandas:DataFrame>` representation of the SPSS data (or
      :obj:`None <python:None>`) and a :class:`Metadata <Metadata>` representation of the
      data's meta-data (value and labels / data map).
    :rtype: :class:`pandas.DataFrame <pandas:DataFrame>`/:obj:`None <python:None>` and
      :class:`Metadata <Metadata>`

    """
    read_options = {
        'limit': limit,
        'offset': offset,
        'exclude_variables': exclude_variables,
        'include_variables': include_variables,
        'metadata_only': metadata_only,
        'apply_labels': apply_labels,
        'labels_as_categories': labels_as_categories,
        'missing_as_NaN': missing_as_NaN,
        'convert_datetimes': convert_datetimes,
        'dates_as_datetime64': dates_as_datetime64,
    }

    return _read_spss(data, **read_options, **kwargs)
def to_csv(data: Union['os.PathLike[Any]', BytesIO, bytes],
           target: Optional[Union['os.PathLike[Any]', StringIO]] = None,
           include_header: bool = True,
           delimter: str = '|',
           null_text: str = 'NaN',
           wrapper_character: str = "'",
           escape_character: str = "\\",
           line_terminator: str = '\r\n',
           decimal: str = '.',
           limit: Optional[int] = None,
           offset: int = 0,
           exclude_variables: Optional[List[str]] = None,
           include_variables: Optional[List[str]] = None,
           metadata_only: bool = False,
           apply_labels: bool = False,
           labels_as_categories: bool = True,
           missing_as_NaN: bool = False,
           convert_datetimes: bool = True,
           dates_as_datetime64: bool = False,
           **kwargs):
    r"""Convert the SPSS ``data`` into a CSV string where each row represents a record of
    SPSS data.

    :param data: The SPSS data to load. Accepts either a series of bytes or a filename.
    :type data: Path-like filename, :class:`bytes <python:bytes>` or
      :class:`BytesIO <python:io.BytesIO>`

    :param target: The destination where the CSV representation should be stored. Accepts
      either a filename, a :class:`StringIO <python:io.StringIO>`, or
      :obj:`None <python:None>`. If :obj:`None <python:None>`, will return a
      :class:`str <python:str>` object stored in-memory. Defaults to
      :obj:`None <python:None>`.
    :type target: Path-like / :class:`StringIO <python:io.StringIO>` /
      :class:`str <python:str>` / :obj:`None <python:None>`

    :param include_header: If ``True``, will include a header row with column labels. If
      ``False``, will not include a header row. Defaults to ``True``.
    :type include_header: :class:`bool <python:bool>`

    :param delimter: The delimiter used between columns. Defaults to ``|``.

      .. note::

        The parameter name really is spelled ``delimter``; it is kept as-is so that
        existing callers passing it by keyword continue to work.

    :type delimter: :class:`str <python:str>`

    :param null_text: The text value to use in place of missing values (passed to
      Pandas as ``na_rep``). Defaults to ``'NaN'``.
    :type null_text: :class:`str <python:str>`

    :param wrapper_character: The string used to wrap string values when wrapping is
      necessary (passed to Pandas as ``quotechar``). Defaults to ``'``.
    :type wrapper_character: :class:`str <python:str>`

    :param escape_character: The character to use when escaping nested wrapper
      characters (passed to Pandas as ``escapechar``). Defaults to ``\``.
    :type escape_character: :class:`str <python:str>`

    :param line_terminator: The character used to mark the end of a line. Defaults to
      ``\r\n``.
    :type line_terminator: :class:`str <python:str>`

    :param decimal: The character used to indicate a decimal place in a numerical value.
      Defaults to ``.``.
    :type decimal: :class:`str <python:str>`

    :param limit: The number of records to read from the data. If :obj:`None <python:None>`
      will return all records. Defaults to :obj:`None <python:None>`.
    :type limit: :class:`int <python:int>` or :obj:`None <python:None>`

    :param offset: The record at which to start reading the data. Defaults to 0 (first
      record).
    :type offset: :class:`int <python:int>`

    :param exclude_variables: A list of the variables that should be ignored when reading
      data. Defaults to :obj:`None <python:None>`.
    :type exclude_variables: iterable of :class:`str <python:str>` or
      :obj:`None <python:None>`

    :param include_variables: A list of the variables that should be explicitly included
      when reading data. Defaults to :obj:`None <python:None>`.
    :type include_variables: iterable of :class:`str <python:str>` or
      :obj:`None <python:None>`

    :param metadata_only: If ``True``, will return no data records in the resulting
      :class:`DataFrame <pandas:pandas.DataFrame>` but will return a complete
      :class:`Metadata` instance. Defaults to ``False``.
    :type metadata_only: :class:`bool <python:bool>`

    :param apply_labels: If ``True``, converts the numerically-coded values in the raw
      data to their human-readable labels. Defaults to ``False``.
    :type apply_labels: :class:`bool <python:bool>`

    :param labels_as_categories: If ``True``, will convert labeled or formatted values to
      Pandas :term:`categories <pandas:category>`. Defaults to ``True``.

      .. caution::

        This parameter will only have an effect if the ``apply_labels`` parameter is
        ``True``.

    :type labels_as_categories: :class:`bool <python:bool>`

    :param missing_as_NaN: If ``True``, will return any missing values as
      :class:`NaN <pandas:NaN>`. Otherwise will return missing values as per the
      configuration of missing value representation stored in the underlying SPSS data.
      Defaults to ``False``, which applies the missing value representation configured in
      the SPSS data itself.
    :type missing_as_NaN: :class:`bool <python:bool>`

    :param convert_datetimes: if ``True``, will convert the native integer representation
      of datetime values in the SPSS data to Pythonic
      :class:`datetime <python:datetime.datetime>`, or
      :class:`date <python:datetime.date>`, etc. representations (or Pandas
      :class:`datetime64 <pandas:datetime64>`, depending on the ``dates_as_datetime64``
      parameter). If ``False``, will leave the original integer representation. Defaults
      to ``True``.
    :type convert_datetimes: :class:`bool <python:bool>`

    :param dates_as_datetime64: If ``True``, will return any date values as Pandas
      :class:`datetime64 <pandas.datetime64>` types. Defaults to ``False``.

      .. caution::

        This parameter is only applied if ``convert_datetimes`` is set to ``True``.

    :type dates_as_datetime64: :class:`bool <python:bool>`

    :returns: :obj:`None <python:None>` if ``target`` was not :obj:`None <python:None>`,
      otherwise a :class:`str <python:str>` representation of the CSV file.
    :rtype: :obj:`None <python:None>` or :class:`str <python:str>`

    :raises InvalidDataFormatError: if ``target`` is neither path-like, a
      :class:`StringIO <python:io.StringIO>`, nor empty

    """
    import inspect

    if target and not checkers.is_pathlike(target) and not checkers.is_stringIO(target):
        # Report the type of the offending ``target`` (not of ``data``).
        raise errors.InvalidDataFormatError('target must be a filename, StringIO object, '
                                            f'or None. Was: {target.__class__.__name__}')

    df, _ = _read_spss(data,
                       limit = limit,
                       offset = offset,
                       exclude_variables = exclude_variables,
                       include_variables = include_variables,
                       metadata_only = metadata_only,
                       apply_labels = apply_labels,
                       labels_as_categories = labels_as_categories,
                       missing_as_NaN = missing_as_NaN,
                       convert_datetimes = convert_datetimes,
                       dates_as_datetime64 = dates_as_datetime64,
                       **kwargs)

    csv_options = {
        'sep': delimter,
        'na_rep': null_text,
        'header': include_header,
        'quotechar': wrapper_character,
        'escapechar': escape_character,
        'decimal': decimal,
    }

    # Pandas renamed ``line_terminator`` to ``lineterminator`` (1.5) and later dropped
    # the old spelling, so use whichever keyword the installed version exposes.
    if 'lineterminator' in inspect.signature(df.to_csv).parameters:
        csv_options['lineterminator'] = line_terminator
    else:
        csv_options['line_terminator'] = line_terminator

    result = df.to_csv(target, **csv_options)

    if target is not None:
        return None

    return result
def to_json(data: Union['os.PathLike[Any]', BytesIO, bytes],
            target: Optional[Union['os.PathLike[Any]', StringIO]] = None,
            layout: str = 'records',
            double_precision: int = 10,
            limit: Optional[int] = None,
            offset: int = 0,
            exclude_variables: Optional[List[str]] = None,
            include_variables: Optional[List[str]] = None,
            metadata_only: bool = False,
            apply_labels: bool = False,
            labels_as_categories: bool = True,
            missing_as_NaN: bool = False,
            convert_datetimes: bool = True,
            dates_as_datetime64: bool = False,
            **kwargs):
    r"""Convert the SPSS ``data`` into a JSON string.

    :param data: The SPSS data to load. Accepts either a series of bytes or a filename.
    :type data: Path-like filename, :class:`bytes <python:bytes>` or
      :class:`BytesIO <python:io.BytesIO>`

    :param target: The destination where the JSON representation should be stored. Accepts
      either a filename, a :class:`StringIO <python:io.StringIO>`, or
      :obj:`None <python:None>`. If :obj:`None <python:None>`, will return a
      :class:`str <python:str>` object stored in-memory. Defaults to
      :obj:`None <python:None>`.
    :type target: Path-like / :class:`StringIO <python:io.StringIO>` /
      :class:`str <python:str>` / :obj:`None <python:None>`

    :param layout: Indicates the layout schema to use for the JSON representation of the
      data. Accepts:

        * ``records``, where the resulting JSON object represents an array of objects
          where each object corresponds to a single record, with key/value pairs for each
          column and that record's corresponding value
        * ``table``, where the resulting JSON object contains a metadata (data map)
          describing the data schema along with the resulting collection of record objects

      Defaults to ``records``.
    :type layout: :class:`str <python:str>`

    :param double_precision: Indicates the precision (places beyond the decimal point) to
      apply for floating point values. Defaults to ``10``.
    :type double_precision: :class:`int <python:int>`

    :param limit: The number of records to read from the data. If :obj:`None <python:None>`
      will return all records. Defaults to :obj:`None <python:None>`.
    :type limit: :class:`int <python:int>` or :obj:`None <python:None>`

    :param offset: The record at which to start reading the data. Defaults to 0 (first
      record).
    :type offset: :class:`int <python:int>`

    :param exclude_variables: A list of the variables that should be ignored when reading
      data. Defaults to :obj:`None <python:None>`.
    :type exclude_variables: iterable of :class:`str <python:str>` or
      :obj:`None <python:None>`

    :param include_variables: A list of the variables that should be explicitly included
      when reading data. Defaults to :obj:`None <python:None>`.
    :type include_variables: iterable of :class:`str <python:str>` or
      :obj:`None <python:None>`

    :param metadata_only: If ``True``, will return no data records in the resulting
      :class:`DataFrame <pandas:pandas.DataFrame>` but will return a complete
      :class:`Metadata` instance. Defaults to ``False``.
    :type metadata_only: :class:`bool <python:bool>`

    :param apply_labels: If ``True``, converts the numerically-coded values in the raw
      data to their human-readable labels. Defaults to ``False``.
    :type apply_labels: :class:`bool <python:bool>`

    :param labels_as_categories: If ``True``, will convert labeled or formatted values to
      Pandas :term:`categories <pandas:category>`. Defaults to ``True``.

      .. caution::

        This parameter will only have an effect if the ``apply_labels`` parameter is
        ``True``.

    :type labels_as_categories: :class:`bool <python:bool>`

    :param missing_as_NaN: If ``True``, will return any missing values as
      :class:`NaN <pandas:NaN>`. Otherwise will return missing values as per the
      configuration of missing value representation stored in the underlying SPSS data.
      Defaults to ``False``, which applies the missing value representation configured in
      the SPSS data itself.
    :type missing_as_NaN: :class:`bool <python:bool>`

    :param convert_datetimes: if ``True``, will convert the native integer representation
      of datetime values in the SPSS data to Pythonic
      :class:`datetime <python:datetime.datetime>`, or
      :class:`date <python:datetime.date>`, etc. representations (or Pandas
      :class:`datetime64 <pandas:datetime64>`, depending on the ``dates_as_datetime64``
      parameter). If ``False``, will leave the original integer representation. Defaults
      to ``True``.
    :type convert_datetimes: :class:`bool <python:bool>`

    :param dates_as_datetime64: If ``True``, will return any date values as Pandas
      :class:`datetime64 <pandas.datetime64>` types. Defaults to ``False``.

      .. caution::

        This parameter is only applied if ``convert_datetimes`` is set to ``True``.

    :type dates_as_datetime64: :class:`bool <python:bool>`

    :returns: :obj:`None <python:None>` if ``target`` was not :obj:`None <python:None>`,
      otherwise a :class:`str <python:str>` representation of the JSON output.
    :rtype: :obj:`None <python:None>` or :class:`str <python:str>`

    :raises InvalidDataFormatError: if ``target`` is neither path-like, a
      :class:`StringIO <python:io.StringIO>`, nor empty
    :raises InvalidLayoutError: if ``layout`` is not ``'records'`` or ``'table'``

    """
    if target and not checkers.is_pathlike(target) and not checkers.is_stringIO(target):
        # Report the type of the offending ``target`` (not of ``data``).
        raise errors.InvalidDataFormatError('target must be a filename, StringIO object, '
                                            f'or None. Was: {target.__class__.__name__}')

    if layout not in ['records', 'table']:
        raise errors.InvalidLayoutError('layout must be either "records" or "table". '
                                        f'Was: "{layout}"')

    df, _ = _read_spss(data,
                       limit = limit,
                       offset = offset,
                       exclude_variables = exclude_variables,
                       include_variables = include_variables,
                       metadata_only = metadata_only,
                       apply_labels = apply_labels,
                       labels_as_categories = labels_as_categories,
                       missing_as_NaN = missing_as_NaN,
                       convert_datetimes = convert_datetimes,
                       dates_as_datetime64 = dates_as_datetime64,
                       **kwargs)

    result = df.to_json(target,
                        orient = layout,
                        double_precision = double_precision)

    if target is not None:
        return None

    return result
def to_yaml(data: Union['os.PathLike[Any]', BytesIO, bytes],
            target: Optional[Union['os.PathLike[Any]', StringIO]] = None,
            layout: str = 'records',
            double_precision: int = 10,
            limit: Optional[int] = None,
            offset: int = 0,
            exclude_variables: Optional[List[str]] = None,
            include_variables: Optional[List[str]] = None,
            metadata_only: bool = False,
            apply_labels: bool = False,
            labels_as_categories: bool = True,
            missing_as_NaN: bool = False,
            convert_datetimes: bool = True,
            dates_as_datetime64: bool = False,
            **kwargs):
    r"""Convert the SPSS ``data`` into a YAML string.

    The data is first serialized to JSON, de-serialized into plain Python objects, and
    those objects are then dumped as YAML.

    :param data: The SPSS data to load. Accepts either a series of bytes or a filename.
    :type data: Path-like filename, :class:`bytes <python:bytes>` or
      :class:`BytesIO <python:io.BytesIO>`

    :param target: The destination where the YAML representation should be stored. Accepts
      either a filename, a :class:`StringIO <python:io.StringIO>`, or
      :obj:`None <python:None>`. If :obj:`None <python:None>`, will return a
      :class:`str <python:str>` object stored in-memory. Defaults to
      :obj:`None <python:None>`.
    :type target: Path-like / :class:`StringIO <python:io.StringIO>` /
      :class:`str <python:str>` / :obj:`None <python:None>`

    :param layout: Indicates the layout schema to use for the YAML representation of the
      data. Accepts:

        * ``records``, where the resulting YAML object represents an array of objects
          where each object corresponds to a single record, with key/value pairs for each
          column and that record's corresponding value
        * ``table``, where the resulting YAML object contains a metadata (data map)
          describing the data schema along with the resulting collection of record objects

      Defaults to ``records``.
    :type layout: :class:`str <python:str>`

    :param double_precision: Indicates the precision (places beyond the decimal point) to
      apply for floating point values. Defaults to ``10``.
    :type double_precision: :class:`int <python:int>`

    :param limit: The number of records to read from the data. If :obj:`None <python:None>`
      will return all records. Defaults to :obj:`None <python:None>`.
    :type limit: :class:`int <python:int>` or :obj:`None <python:None>`

    :param offset: The record at which to start reading the data. Defaults to 0 (first
      record).
    :type offset: :class:`int <python:int>`

    :param exclude_variables: A list of the variables that should be ignored when reading
      data. Defaults to :obj:`None <python:None>`.
    :type exclude_variables: iterable of :class:`str <python:str>` or
      :obj:`None <python:None>`

    :param include_variables: A list of the variables that should be explicitly included
      when reading data. Defaults to :obj:`None <python:None>`.
    :type include_variables: iterable of :class:`str <python:str>` or
      :obj:`None <python:None>`

    :param metadata_only: If ``True``, will return no data records in the resulting
      :class:`DataFrame <pandas:pandas.DataFrame>` but will return a complete
      :class:`Metadata` instance. Defaults to ``False``.
    :type metadata_only: :class:`bool <python:bool>`

    :param apply_labels: If ``True``, converts the numerically-coded values in the raw
      data to their human-readable labels. Defaults to ``False``.
    :type apply_labels: :class:`bool <python:bool>`

    :param labels_as_categories: If ``True``, will convert labeled or formatted values to
      Pandas :term:`categories <pandas:category>`. Defaults to ``True``.

      .. caution::

        This parameter will only have an effect if the ``apply_labels`` parameter is
        ``True``.

    :type labels_as_categories: :class:`bool <python:bool>`

    :param missing_as_NaN: If ``True``, will return any missing values as
      :class:`NaN <pandas:NaN>`. Otherwise will return missing values as per the
      configuration of missing value representation stored in the underlying SPSS data.
      Defaults to ``False``, which applies the missing value representation configured in
      the SPSS data itself.
    :type missing_as_NaN: :class:`bool <python:bool>`

    :param convert_datetimes: if ``True``, will convert the native integer representation
      of datetime values in the SPSS data to Pythonic
      :class:`datetime <python:datetime.datetime>`, or
      :class:`date <python:datetime.date>`, etc. representations (or Pandas
      :class:`datetime64 <pandas:datetime64>`, depending on the ``dates_as_datetime64``
      parameter). If ``False``, will leave the original integer representation. Defaults
      to ``True``.
    :type convert_datetimes: :class:`bool <python:bool>`

    :param dates_as_datetime64: If ``True``, will return any date values as Pandas
      :class:`datetime64 <pandas.datetime64>` types. Defaults to ``False``.

      .. caution::

        This parameter is only applied if ``convert_datetimes`` is set to ``True``.

    :type dates_as_datetime64: :class:`bool <python:bool>`

    :returns: :obj:`None <python:None>` if ``target`` was not :obj:`None <python:None>`,
      otherwise a :class:`str <python:str>` representation of the YAML output.
    :rtype: :obj:`None <python:None>` or :class:`str <python:str>`

    :raises InvalidDataFormatError: if ``target`` is neither path-like, a
      :class:`StringIO <python:io.StringIO>`, nor empty
    :raises InvalidLayoutError: if ``layout`` is not ``'records'`` or ``'table'``

    """
    if target and not checkers.is_pathlike(target) and not checkers.is_stringIO(target):
        # Report the type of the offending ``target`` (not of ``data``).
        raise errors.InvalidDataFormatError('target must be a filename, StringIO object, '
                                            f'or None. Was: {target.__class__.__name__}')

    if layout not in ['records', 'table']:
        raise errors.InvalidLayoutError('layout must be either "records" or "table". '
                                        f'Was: "{layout}"')

    df, _ = _read_spss(data,
                       limit = limit,
                       offset = offset,
                       exclude_variables = exclude_variables,
                       include_variables = include_variables,
                       metadata_only = metadata_only,
                       apply_labels = apply_labels,
                       labels_as_categories = labels_as_categories,
                       missing_as_NaN = missing_as_NaN,
                       convert_datetimes = convert_datetimes,
                       dates_as_datetime64 = dates_as_datetime64,
                       **kwargs)

    # Round-trip through JSON so that only plain Python objects reach the YAML dumper.
    as_json = df.to_json(None,
                         orient = layout,
                         double_precision = double_precision)
    as_dict = json.loads(as_json)
    as_yaml = yaml.dump(as_dict)

    if target is None:
        return as_yaml

    if checkers.is_stringIO(target):
        # An in-memory text buffer cannot be passed to open(); write to it directly.
        target.write(as_yaml)
    else:
        # ``as_yaml`` is a str, so the file must be opened in text mode.
        with open(target, 'w', encoding = 'utf-8') as target_file:
            target_file.write(as_yaml)

    return None
def to_dict(data: Union['os.PathLike[Any]', BytesIO, bytes],
            layout: str = 'records',
            double_precision: int = 10,
            limit: Optional[int] = None,
            offset: int = 0,
            exclude_variables: Optional[List[str]] = None,
            include_variables: Optional[List[str]] = None,
            metadata_only: bool = False,
            apply_labels: bool = False,
            labels_as_categories: bool = True,
            missing_as_NaN: bool = False,
            convert_datetimes: bool = True,
            dates_as_datetime64: bool = False,
            **kwargs):
    r"""Convert the SPSS ``data`` into native Python objects.

    The data is first serialized by :func:`to_json` and the resulting JSON is
    then parsed back into Python objects, so the shape of the result follows
    the JSON ``layout`` that was requested.

    :param data: The SPSS data to load. Accepts either a series of bytes or a
      filename.
    :type data: Path-like filename, :class:`bytes <python:bytes>` or
      :class:`BytesIO <python:io.bytesIO>`

    :param layout: The layout schema used for the intermediate JSON
      representation. Accepts:

        * ``records``, an array of objects where each object corresponds to a
          single record, with key/value pairs for each column and that
          record's corresponding value
        * ``table``, an object containing a metadata (data map) describing the
          data schema along with the collection of record objects

      Defaults to ``records``.
    :type layout: :class:`str <python:str>`

    :param double_precision: The precision (places beyond the decimal point)
      to apply to floating point values. Defaults to ``10``.
    :type double_precision: :class:`int <python:int>`

    :param limit: The number of records to read from the data. If
      :obj:`None <python:None>` will return all records. Defaults to
      :obj:`None <python:None>`.
    :type limit: :class:`int <python:int>` or :obj:`None <python:None>`

    :param offset: The record at which to start reading the data. Defaults to
      0 (first record).
    :type offset: :class:`int <python:int>`

    :param exclude_variables: A list of the variables that should be ignored
      when reading data. Defaults to :obj:`None <python:None>`.
    :type exclude_variables: iterable of :class:`str <python:str>` or
      :obj:`None <python:None>`

    :param include_variables: A list of the variables that should be
      explicitly included when reading data. Defaults to
      :obj:`None <python:None>`.
    :type include_variables: iterable of :class:`str <python:str>` or
      :obj:`None <python:None>`

    :param metadata_only: If ``True``, no data records are read. Defaults to
      ``False``.
    :type metadata_only: :class:`bool <python:bool>`

    :param apply_labels: If ``True``, converts the numerically-coded values in
      the raw data to their human-readable labels. Defaults to ``False``.
    :type apply_labels: :class:`bool <python:bool>`

    :param labels_as_categories: If ``True``, will convert labeled or
      formatted values to Pandas :term:`categories <pandas:category>`.
      Defaults to ``True``.

      .. caution::

        This parameter will only have an effect if the ``apply_labels``
        parameter is ``True``.

    :type labels_as_categories: :class:`bool <python:bool>`

    :param missing_as_NaN: If ``True``, will return any missing values as
      :class:`NaN <pandas:NaN>`. Otherwise missing values follow the missing
      value representation configured in the SPSS data itself. Defaults to
      ``False``.
    :type missing_as_NaN: :class:`bool <python:bool>`

    :param convert_datetimes: If ``True``, will convert the native integer
      representation of datetime values in the SPSS data to Pythonic
      :class:`datetime <python:datetime.datetime>`,
      :class:`date <python:datetime.date>`, etc. representations (or Pandas
      :class:`datetime64 <pandas:datetime64>`, depending on the
      ``dates_as_datetime64`` parameter). If ``False``, will leave the
      original integer representation. Defaults to ``True``.
    :type convert_datetimes: :class:`bool <python:bool>`

    :param dates_as_datetime64: If ``True``, will return any date values as
      Pandas :class:`datetime64 <pandas.datetime64>` types. Defaults to
      ``False``.

      .. caution::

        This parameter is only applied if ``convert_datetimes`` is set to
        ``True``.

    :type dates_as_datetime64: :class:`bool <python:bool>`

    :param kwargs: Any additional keyword arguments, forwarded unchanged to
      :func:`to_json`.

    :returns: A :class:`list <python:list>` of :class:`dict <python:dict>` if
      ``layout`` is ``records``, or a :class:`dict <python:dict>` if ``layout``
      is ``table``.
    :rtype: :class:`list <python:list>` or :class:`dict <python:dict>`

    :raises InvalidLayoutError: if ``layout`` is neither ``records`` nor
      ``table``

    """
    supported_layouts = ('records', 'table')
    if layout not in supported_layouts:
        raise errors.InvalidLayoutError(
            'layout must be either "records" or "table". '
            f'Was: "{layout}"'
        )

    # Named parameters can never also appear in ``kwargs``, so gathering them
    # into a single mapping forwards exactly the same keywords to to_json().
    json_options = dict(layout=layout,
                        double_precision=double_precision,
                        limit=limit,
                        offset=offset,
                        exclude_variables=exclude_variables,
                        include_variables=include_variables,
                        metadata_only=metadata_only,
                        apply_labels=apply_labels,
                        labels_as_categories=labels_as_categories,
                        missing_as_NaN=missing_as_NaN,
                        convert_datetimes=convert_datetimes,
                        dates_as_datetime64=dates_as_datetime64)

    return json.loads(to_json(data, **json_options, **kwargs))
def to_excel(data: Union['os.PathLike[Any]', BytesIO, bytes],
             target: Optional[Union['os.PathLike[Any]', BytesIO, ExcelWriter]] = None,
             sheet_name: str = 'Sheet1',
             start_row: int = 0,
             start_column: int = 0,
             null_text: str = 'NaN',
             include_header: bool = True,
             limit: Optional[int] = None,
             offset: int = 0,
             exclude_variables: Optional[List[str]] = None,
             include_variables: Optional[List[str]] = None,
             metadata_only: bool = False,
             apply_labels: bool = False,
             labels_as_categories: bool = True,
             missing_as_NaN: bool = False,
             convert_datetimes: bool = True,
             dates_as_datetime64: bool = False,
             **kwargs):
    r"""Convert the SPSS ``data`` into an Excel file where each row represents a
    record of SPSS data.

    :param data: The SPSS data to load. Accepts either a series of bytes or a
      filename.
    :type data: Path-like filename, :class:`bytes <python:bytes>` or
      :class:`BytesIO <python:io.bytesIO>`

    :param target: The destination where the Excel file should be stored.
      Accepts either a filename, a :class:`BytesIO <python:io.BytesIO>`, or an
      :class:`ExcelWriter <pandas:pandas.ExcelWriter>` instance. If
      :obj:`None <python:None>`, the Excel file is written to a new
      :class:`BytesIO <python:io.BytesIO>` which is returned. Defaults to
      :obj:`None <python:None>`.
    :type target: Path-like / :class:`BytesIO <python:io.BytesIO>` /
      :class:`ExcelWriter <pandas:pandas.ExcelWriter>` /
      :obj:`None <python:None>`

    :param sheet_name: The worksheet on which the SPSS data should be written.
      Defaults to ``'Sheet1'``.
    :type sheet_name: :class:`str <python:str>`

    :param start_row: The row number (starting at 0) where the SPSS data
      should begin. Defaults to ``0``.
    :type start_row: :class:`int <python:int>`

    :param start_column: The column number (starting at 0) where the SPSS data
      should begin. Defaults to ``0``.
    :type start_column: :class:`int <python:int>`

    :param null_text: The way that missing values should be represented in the
      Excel file. Defaults to ``'NaN'``.
    :type null_text: :class:`str <python:str>`

    :param include_header: If ``True``, will include a header row with column
      labels. If ``False``, will not include a header row. Defaults to
      ``True``.
    :type include_header: :class:`bool <python:bool>`

    :param limit: The number of records to read from the data. If
      :obj:`None <python:None>` will return all records. Defaults to
      :obj:`None <python:None>`.
    :type limit: :class:`int <python:int>` or :obj:`None <python:None>`

    :param offset: The record at which to start reading the data. Defaults to
      0 (first record).
    :type offset: :class:`int <python:int>`

    :param exclude_variables: A list of the variables that should be ignored
      when reading data. Defaults to :obj:`None <python:None>`.
    :type exclude_variables: iterable of :class:`str <python:str>` or
      :obj:`None <python:None>`

    :param include_variables: A list of the variables that should be
      explicitly included when reading data. Defaults to
      :obj:`None <python:None>`.
    :type include_variables: iterable of :class:`str <python:str>` or
      :obj:`None <python:None>`

    :param metadata_only: If ``True``, will return no data records in the
      resulting :class:`DataFrame <pandas:pandas.DataFrame>` but will return a
      complete :class:`Metadata` instance. Defaults to ``False``.
    :type metadata_only: :class:`bool <python:bool>`

    :param apply_labels: If ``True``, converts the numerically-coded values in
      the raw data to their human-readable labels. Defaults to ``False``.
    :type apply_labels: :class:`bool <python:bool>`

    :param labels_as_categories: If ``True``, will convert labeled or
      formatted values to Pandas :term:`categories <pandas:category>`.
      Defaults to ``True``.

      .. caution::

        This parameter will only have an effect if the ``apply_labels``
        parameter is ``True``.

    :type labels_as_categories: :class:`bool <python:bool>`

    :param missing_as_NaN: If ``True``, will return any missing values as
      :class:`NaN <pandas:NaN>`. Otherwise will return missing values as per
      the configuration of missing value representation stored in the
      underlying SPSS data. Defaults to ``False``, which applies the missing
      value representation configured in the SPSS data itself.
    :type missing_as_NaN: :class:`bool <python:bool>`

    :param convert_datetimes: If ``True``, will convert the native integer
      representation of datetime values in the SPSS data to Pythonic
      :class:`datetime <python:datetime.datetime>`, or
      :class:`date <python:datetime.date>`, etc. representations (or Pandas
      :class:`datetime64 <pandas:datetime64>`, depending on the
      ``dates_as_datetime64`` parameter). If ``False``, will leave the
      original integer representation. Defaults to ``True``.
    :type convert_datetimes: :class:`bool <python:bool>`

    :param dates_as_datetime64: If ``True``, will return any date values as
      Pandas :class:`datetime64 <pandas.datetime64>` types. Defaults to
      ``False``.

      .. caution::

        This parameter is only applied if ``convert_datetimes`` is set to
        ``True``.

    :type dates_as_datetime64: :class:`bool <python:bool>`

    :returns: The :class:`BytesIO <python:io.BytesIO>` holding the Excel file
      if ``target`` was :obj:`None <python:None>` or a
      :class:`BytesIO <python:io.BytesIO>` (in which case the supplied object
      is written to and returned), otherwise :obj:`None <python:None>`.
    :rtype: :class:`BytesIO <python:io.BytesIO>` or :obj:`None <python:None>`

    :raises InvalidDataFormatError: if ``target`` is not a path-like filename,
      a :class:`BytesIO <python:io.BytesIO>`, an
      :class:`ExcelWriter <pandas:pandas.ExcelWriter>`, or
      :obj:`None <python:None>`

    """
    if target and \
       not checkers.is_pathlike(target) and \
       not checkers.is_bytesIO(target) and \
       not checkers.is_type(target, 'ExcelWriter'):
        # Report the type of the offending ``target`` (not of ``data``).
        raise errors.InvalidDataFormatError('target must be a filename, BytesIO, '
                                            'ExcelWriter, or None. '
                                            f'Was: {target.__class__.__name__}')

    # The metadata half of the result is not needed for the Excel export.
    df, _ = _read_spss(data,
                       limit = limit,
                       offset = offset,
                       exclude_variables = exclude_variables,
                       include_variables = include_variables,
                       metadata_only = metadata_only,
                       apply_labels = apply_labels,
                       labels_as_categories = labels_as_categories,
                       missing_as_NaN = missing_as_NaN,
                       convert_datetimes = convert_datetimes,
                       dates_as_datetime64 = dates_as_datetime64,
                       **kwargs)

    return_target = False
    if not target:
        # No destination supplied: build the workbook in memory and hand it back.
        target = BytesIO()
        return_target = True
    elif checkers.is_bytesIO(target):
        # Write into the caller's own buffer rather than discarding it for a
        # fresh one; it is still returned so existing callers that use the
        # return value keep working.
        return_target = True

    df.to_excel(target,
                sheet_name = sheet_name,
                na_rep = null_text,
                header = include_header,
                startrow = start_row,
                startcol = start_column)

    if return_target:
        return target

    return None