Source code for openepda.main

# -*- coding: utf-8 -*-
"""openepda.main.py

This module contains utility functions and openEPDA data file writer and
loader.

Author: Dima Pustakhod
Copyright: 2020--2022, TU/e - PITC and authors
"""
import collections.abc as coll
import os
from csv import QUOTE_NONNUMERIC
from datetime import datetime

import numpy as np
import pandas as pd
from pandas.errors import ParserError
from ruamel.yaml import YAML

DFLT_DATA_FILE_ENCODING = "utf-8"


def is_list_or_tuple(x):
    """
    Check if a variable is either a tuple or a list

    The check is made against ``collections.abc.Sequence``, so other
    sequence types (for example ``range``) are accepted as well. Text
    and byte strings are sequences too, but they are explicitly rejected.

    Parameters
    ----------
    x : Any
        Object to be tested.

    Returns
    -------
    bool
        True if x is a tuple or list, or a subclass of these;
        False otherwise

    Examples
    --------
    >>> i = 1; f = 1.1; st = 's'; bs = b'S'
    >>> d = {1: 1, 2: 2}; s = {1, 2, 3}
    >>> l = [1, 2]; t = (1, 2)
    >>> arr = np.asarray([1, 2, 3]); m = np.array([1, 2, 3])

    >>> is_list_or_tuple(i), is_list_or_tuple(f)
    (False, False)
    >>> is_list_or_tuple(st), is_list_or_tuple(bs)
    (False, False)
    >>> is_list_or_tuple(d), is_list_or_tuple(s)
    (False, False)
    >>> is_list_or_tuple(l), is_list_or_tuple(t)
    (True, True)
    >>> is_list_or_tuple(arr), is_list_or_tuple(m)
    (False, False)
    """
    # ``collections.abc.ByteString`` is deprecated since Python 3.12 and
    # scheduled for removal; the built-in types it registered are bytes
    # and bytearray, so they are listed explicitly instead.
    if isinstance(x, (str, bytes, bytearray)):
        return False
    return isinstance(x, coll.Sequence)
def is_array_like(x):
    """
    Tell whether a variable can be treated as an array of values

    An object counts as array-like when ``is_list_or_tuple`` accepts it
    or when it is a ``numpy.ndarray`` instance.

    Returns
    -------
    bool
        True if x is a tuple, a list, a numpy.ndarray, or a subclass of
        these; False otherwise

    Examples
    --------
    >>> i = 1; f = 1.1; st = 's'; bs = b'S'
    >>> d = {1: 1, 2: 2}; s = {1, 2, 3}
    >>> l = [1, 2]; t = (1, 2)
    >>> arr = np.asarray([1, 2, 3]); m = np.array([1, 2, 3])

    >>> is_array_like(i), is_array_like(f)
    (False, False)
    >>> is_array_like(st), is_array_like(bs)
    (False, False)
    >>> is_array_like(d), is_array_like(s)
    (False, False)
    >>> is_array_like(l), is_array_like(t)
    (True, True)
    >>> is_array_like(arr), is_array_like(m)
    (True, True)
    """
    return is_list_or_tuple(x) or isinstance(x, np.ndarray)
class OpenEpdaDataLoader(object):
    """Loader for files written in the openEPDA data format.

    A file is read as a YAML header, followed by a separator line
    (``---`` or ``...``), followed by a CSV table.

    Parameters
    ----------
    encoding : str, optional
        Encoding used to open the files. When omitted (or empty),
        ``DFLT_DATA_FILE_ENCODING`` is used.
    """

    def __init__(self, encoding=None):
        super().__init__()
        self._encoding = encoding or DFLT_DATA_FILE_ENCODING

    def read_file(self, fname):
        """Read an openEPDA data file into a single dictionary.

        Parameters
        ----------
        fname : str
            Path of the file to be read.

        Returns
        -------
        dict
            Keys and values of the YAML header, extended with one entry
            per CSV column (column name -> array of column values). If the
            CSV section is absent or cannot be parsed, only the header
            data is returned.

        Raises
        ------
        ValueError
            If no ``---`` / ``...`` separator line is found after at
            least one header line.
        """
        header_lines = []
        separator_index = 0
        with open(fname, "r", encoding=self._encoding) as f:
            for i, line in enumerate(f):
                if line in ("---\n", "...\n"):
                    separator_index = i
                    break
                header_lines.append(line)

        # Index 0 means "separator not found" (or found on the very first
        # line, which leaves no header at all); both cases are rejected.
        if separator_index == 0:
            raise ValueError(
                'File "{}" does not confirm to the OpenEpda '
                'Data format: "---" or "..." line is missing. File will '
                "be skipped.".format(os.path.basename(fname))
            )

        yaml = YAML(typ="safe")
        yaml.default_flow_style = False
        yaml.explicit_end = None

        # The lines still carry their trailing newline, so they are joined
        # with an empty string; joining with "\n" would insert a blank line
        # after every header line and alter multi-line YAML scalars.
        # A header holding only comments loads as None, hence the fallback.
        file_data = yaml.load("".join(header_lines)) or {}

        try:
            # ``skiprows`` counts physical lines, whereas ``header`` ignores
            # blank lines; skipping keeps the CSV header aligned even when
            # the YAML section contains empty lines.
            table = pd.read_csv(
                fname, skiprows=separator_index + 1, encoding=self._encoding
            )
        except (ParserError, pd.errors.EmptyDataError):
            # Best effort: a missing or malformed CSV section leaves the
            # header data as the only result.
            return file_data

        for col_name in table.columns:
            file_data[col_name] = table[col_name].values

        return file_data
def split_array_data(data, how="max_length"):
    """
    Separate array entries into metadata and tabular (CSV) data

    One reference length is chosen from the lengths of all values. The
    entries having exactly this length form the CSV data, all remaining
    entries form the metadata.

    Parameters
    ----------
    data : dict
        Mapping of names to sized values (anything supporting ``len``).
    how : str
        'max_length|most_common'. Selects the reference length: the
        largest length, or the length which occurs most often.

    Returns
    -------
    tuple of dict
        ``(meta_data, csv_data)``. For empty input, the input object
        itself is returned as the first item.

    Raises
    ------
    ValueError
        If `how` is not one of the supported values (checked for
        non-empty input only).

    Examples
    --------
    >>> split_array_data({2: [1, 2], '2a': [1, 4]})
    ({}, {2: [1, 2], '2a': [1, 4]})

    >>> split_array_data({2: [1, 2], 3: [1, 2, 3], '3a': [1, 2, 4]})
    ({2: [1, 2]}, {3: [1, 2, 3], '3a': [1, 2, 4]})

    >>> split_array_data({2: [1, 2, 3], 3: [1, 2], '3a': [1, 2]},
    ...                  how='most_common')
    ({2: [1, 2, 3]}, {3: [1, 2], '3a': [1, 2]})
    """
    if len(data) == 0:
        return data, {}

    sizes = [len(column) for column in data.values()]

    if how == "max_length":
        reference = max(sizes)
    elif how == "most_common":
        reference = max(set(sizes), key=sizes.count)
    else:
        raise ValueError("Unknown value for how: {}".format(how))

    table, rest = {}, {}
    for key, column in data.items():
        target = table if len(column) == reference else rest
        target[key] = column

    return rest, table
class OpenEpdaDataDumper(object):
    """Writer producing streams in the openEPDA data format.

    Parameters
    ----------
    csv_data : str
        Strategy used to decide which array-like values go to the CSV
        section; forwarded as `how` to ``split_array_data``.
    float_format : str, optional
        Forwarded as `float_format` to ``pandas.DataFrame.to_csv``.
    """

    def __init__(self, csv_data="max_length", float_format=None):
        super().__init__()
        self._csv_data = csv_data
        self._float_format = float_format
        self._encoding = DFLT_DATA_FILE_ENCODING

    def write(self, stream, **data):
        """Write the data to a stream as a YAML header and a CSV table.

        The output starts with a comment line, followed by the YAML
        section (a timestamp, the format version and all non-tabular
        data), a ``...`` separator line, and the CSV section.

        Parameters
        ----------
        stream : Any
            A stream supporting write method. For compatibility with the
            data standard, must be in UTF-8 encoding.
        data : dict
            Data to be dumped into the stream, given as keyword arguments.
            Array-like values (see ``is_array_like``) are converted to
            lists and distributed between the CSV and the YAML sections by
            ``split_array_data``; all other values go to the YAML section.
        """
        yaml_data = {
            "_timestamp": datetime.now().isoformat(),
            "_openEPDA_version": "0.2",
        }
        array_data = {}

        for k, v in data.items():
            if is_array_like(v):
                try:
                    # numpy arrays provide tolist()
                    values = v.tolist()
                except AttributeError:
                    # lists, tuples and other sequences have no tolist();
                    # only this failure is expected here, so nothing
                    # broader (e.g. KeyboardInterrupt) is swallowed
                    values = list(v)
                array_data[k] = values
            else:
                yaml_data[k] = v

        meta_data, csv_data_dict = split_array_data(
            array_data, how=self._csv_data
        )
        # arrays not selected for the CSV table are stored in the header
        yaml_data.update(meta_data)

        stream.write("# OpenEPDA Data Format\n")

        yaml = YAML()
        yaml.default_flow_style = False
        yaml.explicit_end = None
        yaml.dump(yaml_data, stream)

        # separator between the YAML and the CSV sections
        stream.write("...\n")

        df_csv = pd.DataFrame(csv_data_dict)
        df_csv.to_csv(
            stream,
            sep=",",
            quoting=QUOTE_NONNUMERIC,
            decimal=".",
            index=False,
            float_format=self._float_format,
            encoding=self._encoding,
        )