Source code for astropy.table.column

# Licensed under a 3-clause BSD style license - see LICENSE.rst

import warnings
import weakref

from copy import deepcopy

import numpy as np
from numpy import ma

from astropy.units import Unit, Quantity
from astropy.utils.console import color_print
from astropy.utils.metadata import MetaData
from astropy.utils.data_info import BaseColumnInfo, dtype_info_name
from astropy.utils.misc import dtype_bytes_or_chars
from . import groups
from . import pprint
from .np_utils import fix_column_name

# These "shims" provide __getitem__ implementations for Column and MaskedColumn
from ._column_mixins import _ColumnGetitemShim, _MaskedColumnGetitemShim

# Create a generic TableFormatter object for use by bare columns with no
# parent table.
FORMATTER = pprint.TableFormatter()


[docs]class StringTruncateWarning(UserWarning): """ Warning class for when a string column is assigned a value that gets truncated because the base (numpy) string length is too short. This does not inherit from AstropyWarning because we want to use stacklevel=2 to show the user where the issue occurred in their code. """ pass
# Always emit this warning, not just the first instance warnings.simplefilter('always', StringTruncateWarning) def _auto_names(n_cols): from . import conf return [str(conf.auto_colname).format(i) for i in range(n_cols)] # list of one and two-dimensional comparison functions, which sometimes return # a Column class and sometimes a plain array. Used in __array_wrap__ to ensure # they only return plain (masked) arrays (see #1446 and #1685) _comparison_functions = set( [np.greater, np.greater_equal, np.less, np.less_equal, np.not_equal, np.equal, np.isfinite, np.isinf, np.isnan, np.sign, np.signbit]) def col_copy(col, copy_indices=True): """ Mixin-safe version of Column.copy() (with copy_data=True). Parameters ---------- col : Column or mixin column Input column copy_indices : bool Copy the column ``indices`` attribute Returns ------- col : Copy of input column """ if isinstance(col, BaseColumn): return col.copy() # The new column should have None for the parent_table ref. If the # original parent_table weakref there at the point of copying then it # generates an infinite recursion. Instead temporarily remove the weakref # on the original column and restore after the copy in an exception-safe # manner. parent_table = col.info.parent_table indices = col.info.indices col.info.parent_table = None col.info.indices = [] try: newcol = col.copy() if hasattr(col, 'copy') else deepcopy(col) newcol.info = col.info newcol.info.indices = deepcopy(indices or []) if copy_indices else [] for index in newcol.info.indices: index.replace_col(col, newcol) finally: col.info.parent_table = parent_table col.info.indices = indices return newcol class FalseArray(np.ndarray): """ Boolean mask array that is always False. This is used to create a stub ``mask`` property which is a boolean array of ``False`` used by default for mixin columns and corresponding to the mixin column data shape. The ``mask`` looks like a normal numpy array but an exception will be raised if ``True`` is assigned to any element. The consequences of the limitation are most obvious in the high-level table operations. Parameters ---------- shape : tuple Data shape """ def __new__(cls, shape): obj = np.zeros(shape, dtype=bool).view(cls) return obj def __setitem__(self, item, val): val = np.asarray(val) if np.any(val): raise ValueError('Cannot set any element of {} class to True' .format(self.__class__.__name__)) def _expand_string_array_for_values(arr, values): """ For string-dtype return a version of ``arr`` that is wide enough for ``values``. If ``arr`` is not string-dtype or does not need expansion then return ``arr``. Parameters ---------- arr : np.ndarray Input array values : scalar or array_like Values for width comparison for string arrays Returns ------- arr_expanded : np.ndarray """ if arr.dtype.kind in ('U', 'S') and values is not np.ma.masked: # Find the length of the longest string in the new values. values_str_len = np.char.str_len(values).max() # Determine character repeat count of arr.dtype. Returns a positive # int or None (something like 'U0' is not possible in numpy). If new values # are longer than current then make a new (wider) version of arr. arr_str_len = dtype_bytes_or_chars(arr.dtype) if arr_str_len and values_str_len > arr_str_len: arr_dtype = arr.dtype.byteorder + arr.dtype.kind + str(values_str_len) arr = arr.astype(arr_dtype) return arr
[docs]class ColumnInfo(BaseColumnInfo): """ Container for meta information like name, description, format. This is required when the object is used as a mixin column within a table, but can be used as a general way to store meta information. """ attrs_from_parent = BaseColumnInfo.attr_names _supports_indexing = True def new_like(self, cols, length, metadata_conflicts='warn', name=None): """ Return a new Column instance which is consistent with the input ``cols`` and has ``length`` rows. This is intended for creating an empty column object whose elements can be set in-place for table operations like join or vstack. Parameters ---------- cols : list List of input columns length : int Length of the output column object metadata_conflicts : str ('warn'|'error'|'silent') How to handle metadata conflicts name : str Output column name Returns ------- col : Column (or subclass) New instance of this class consistent with ``cols`` """ attrs = self.merge_cols_attributes(cols, metadata_conflicts, name, ('meta', 'unit', 'format', 'description')) return self._parent_cls(length=length, **attrs) def get_sortable_arrays(self): """ Return a list of arrays which can be lexically sorted to represent the order of the parent column. For Column this is just the column itself. Returns ------- arrays : list of ndarray """ return [self._parent]
class BaseColumn(_ColumnGetitemShim, np.ndarray): meta = MetaData() def __new__(cls, data=None, name=None, dtype=None, shape=(), length=0, description=None, unit=None, format=None, meta=None, copy=False, copy_indices=True): if data is None: dtype = (np.dtype(dtype).str, shape) self_data = np.zeros(length, dtype=dtype) elif isinstance(data, BaseColumn) and hasattr(data, '_name'): # When unpickling a MaskedColumn, ``data`` will be a bare # BaseColumn with none of the expected attributes. In this case # do NOT execute this block which initializes from ``data`` # attributes. self_data = np.array(data.data, dtype=dtype, copy=copy) if description is None: description = data.description if unit is None: unit = unit or data.unit if format is None: format = data.format if meta is None: meta = data.meta if name is None: name = data.name elif isinstance(data, Quantity): if unit is None: self_data = np.array(data, dtype=dtype, copy=copy) unit = data.unit else: self_data = np.array(data.to(unit), dtype=dtype, copy=copy) if description is None: description = data.info.description if format is None: format = data.info.format if meta is None: meta = data.info.meta else: if np.dtype(dtype).char == 'S': data = cls._encode_str(data) self_data = np.array(data, dtype=dtype, copy=copy) self = self_data.view(cls) self._name = fix_column_name(name) self._parent_table = None self.unit = unit self._format = format self.description = description self.meta = meta self.indices = deepcopy(getattr(data, 'indices', [])) if copy_indices else [] for index in self.indices: index.replace_col(data, self) return self @property def data(self): return self.view(np.ndarray) @property def parent_table(self): # Note: It seems there are some cases where _parent_table is not set, # such after restoring from a pickled Column. Perhaps that should be # fixed, but this is also okay for now. if getattr(self, '_parent_table', None) is None: return None else: return self._parent_table() @parent_table.setter def parent_table(self, table): if table is None: self._parent_table = None else: self._parent_table = weakref.ref(table) info = ColumnInfo() def copy(self, order='C', data=None, copy_data=True): """ Return a copy of the current instance. If ``data`` is supplied then a view (reference) of ``data`` is used, and ``copy_data`` is ignored. Parameters ---------- order : {'C', 'F', 'A', 'K'}, optional Controls the memory layout of the copy. 'C' means C-order, 'F' means F-order, 'A' means 'F' if ``a`` is Fortran contiguous, 'C' otherwise. 'K' means match the layout of ``a`` as closely as possible. (Note that this function and :func:numpy.copy are very similar, but have different default values for their order= arguments.) Default is 'C'. data : array, optional If supplied then use a view of ``data`` instead of the instance data. This allows copying the instance attributes and meta. copy_data : bool, optional Make a copy of the internal numpy array instead of using a reference. Default is True. Returns ------- col : Column or MaskedColumn Copy of the current column (same type as original) """ if data is None: data = self.data if copy_data: data = data.copy(order) out = data.view(self.__class__) out.__array_finalize__(self) # If there is meta on the original column then deepcopy (since "copy" of column # implies complete independence from original). __array_finalize__ will have already # made a light copy. I'm not sure how to avoid that initial light copy. if self.meta is not None: out.meta = self.meta # MetaData descriptor does a deepcopy here # for MaskedColumn, MaskedArray.__array_finalize__ also copies mask # from self, which is not the idea here, so undo if isinstance(self, MaskedColumn): out._mask = data._mask self._copy_groups(out) return out def __setstate__(self, state): """ Restore the internal state of the Column/MaskedColumn for pickling purposes. This requires that the last element of ``state`` is a 5-tuple that has Column-specific state values. """ # Get the Column attributes names = ('_name', '_unit', '_format', 'description', 'meta', 'indices') attrs = {name: val for name, val in zip(names, state[-1])} state = state[:-1] # Using super().__setstate__(state) gives # "TypeError 'int' object is not iterable", raised in # astropy.table._column_mixins._ColumnGetitemShim.__setstate_cython__() # Previously, it seems to have given an infinite recursion. # Hence, manually call the right super class to actually set up # the array object. super_class = ma.MaskedArray if isinstance(self, ma.MaskedArray) else np.ndarray super_class.__setstate__(self, state) # Set the Column attributes for name, val in attrs.items(): setattr(self, name, val) self._parent_table = None def __reduce__(self): """ Return a 3-tuple for pickling a Column. Use the super-class functionality but then add in a 5-tuple of Column-specific values that get used in __setstate__. """ super_class = ma.MaskedArray if isinstance(self, ma.MaskedArray) else np.ndarray reconstruct_func, reconstruct_func_args, state = super_class.__reduce__(self) # Define Column-specific attrs and meta that gets added to state. column_state = (self.name, self.unit, self.format, self.description, self.meta, self.indices) state = state + (column_state,) return reconstruct_func, reconstruct_func_args, state def __array_finalize__(self, obj): # Obj will be none for direct call to Column() creator if obj is None: return if callable(super().__array_finalize__): super().__array_finalize__(obj) # Self was created from template (e.g. obj[slice] or (obj * 2)) # or viewcast e.g. obj.view(Column). In either case we want to # init Column attributes for self from obj if possible. self.parent_table = None if not hasattr(self, 'indices'): # may have been copied in __new__ self.indices = [] self._copy_attrs(obj) def __array_wrap__(self, out_arr, context=None): """ __array_wrap__ is called at the end of every ufunc. Normally, we want a Column object back and do not have to do anything special. But there are two exceptions: 1) If the output shape is different (e.g. for reduction ufuncs like sum() or mean()), a Column still linking to a parent_table makes little sense, so we return the output viewed as the column content (ndarray or MaskedArray). For this case, we use "[()]" to select everything, and to ensure we convert a zero rank array to a scalar. (For some reason np.sum() returns a zero rank scalar array while np.mean() returns a scalar; So the [()] is needed for this case. 2) When the output is created by any function that returns a boolean we also want to consistently return an array rather than a column (see #1446 and #1685) """ out_arr = super().__array_wrap__(out_arr, context) if (self.shape != out_arr.shape or (isinstance(out_arr, BaseColumn) and (context is not None and context[0] in _comparison_functions))): return out_arr.data[()] else: return out_arr @property def name(self): """ The name of this column. """ return self._name @name.setter def name(self, val): val = fix_column_name(val) if self.parent_table is not None: table = self.parent_table table.columns._rename_column(self.name, val) self._name = val @property def format(self): """ Format string for displaying values in this column. """ return self._format @format.setter def format(self, format_string): prev_format = getattr(self, '_format', None) self._format = format_string # set new format string try: # test whether it formats without error exemplarily self.pformat(max_lines=1) except Exception as err: # revert to restore previous format if there was one self._format = prev_format raise ValueError( "Invalid format for column '{}': could not display " "values in this column using this format ({})".format( self.name, err.args[0])) @property def descr(self): """Array-interface compliant full description of the column. This returns a 3-tuple (name, type, shape) that can always be used in a structured array dtype definition. """ return (self.name, self.dtype.str, self.shape[1:]) def iter_str_vals(self): """ Return an iterator that yields the string-formatted values of this column. Returns ------- str_vals : iterator Column values formatted as strings """ # Iterate over formatted values with no max number of lines, no column # name, no unit, and ignoring the returned header info in outs. _pformat_col_iter = self._formatter._pformat_col_iter for str_val in _pformat_col_iter(self, -1, show_name=False, show_unit=False, show_dtype=False, outs={}): yield str_val def attrs_equal(self, col): """Compare the column attributes of ``col`` to this object. The comparison attributes are: ``name``, ``unit``, ``dtype``, ``format``, ``description``, and ``meta``. Parameters ---------- col : Column Comparison column Returns ------- equal : bool True if all attributes are equal """ if not isinstance(col, BaseColumn): raise ValueError('Comparison `col` must be a Column or ' 'MaskedColumn object') attrs = ('name', 'unit', 'dtype', 'format', 'description', 'meta') equal = all(getattr(self, x) == getattr(col, x) for x in attrs) return equal @property def _formatter(self): return FORMATTER if (self.parent_table is None) else self.parent_table.formatter def pformat(self, max_lines=None, show_name=True, show_unit=False, show_dtype=False, html=False): """Return a list of formatted string representation of column values. If no value of ``max_lines`` is supplied then the height of the screen terminal is used to set ``max_lines``. If the terminal height cannot be determined then the default will be determined using the ``astropy.conf.max_lines`` configuration item. If a negative value of ``max_lines`` is supplied then there is no line limit applied. Parameters ---------- max_lines : int Maximum lines of output (header + data rows) show_name : bool Include column name. Default is True. show_unit : bool Include a header row for unit. Default is False. show_dtype : bool Include column dtype. Default is False. html : bool Format the output as an HTML table. Default is False. Returns ------- lines : list List of lines with header and formatted column values """ _pformat_col = self._formatter._pformat_col lines, outs = _pformat_col(self, max_lines, show_name=show_name, show_unit=show_unit, show_dtype=show_dtype, html=html) return lines def pprint(self, max_lines=None, show_name=True, show_unit=False, show_dtype=False): """Print a formatted string representation of column values. If no value of ``max_lines`` is supplied then the height of the screen terminal is used to set ``max_lines``. If the terminal height cannot be determined then the default will be determined using the ``astropy.conf.max_lines`` configuration item. If a negative value of ``max_lines`` is supplied then there is no line limit applied. Parameters ---------- max_lines : int Maximum number of values in output show_name : bool Include column name. Default is True. show_unit : bool Include a header row for unit. Default is False. show_dtype : bool Include column dtype. Default is True. """ _pformat_col = self._formatter._pformat_col lines, outs = _pformat_col(self, max_lines, show_name=show_name, show_unit=show_unit, show_dtype=show_dtype) n_header = outs['n_header'] for i, line in enumerate(lines): if i < n_header: color_print(line, 'red') else: print(line) def more(self, max_lines=None, show_name=True, show_unit=False): """Interactively browse column with a paging interface. Supported keys:: f, <space> : forward one page b : back one page r : refresh same page n : next row p : previous row < : go to beginning > : go to end q : quit browsing h : print this help Parameters ---------- max_lines : int Maximum number of lines in table output. show_name : bool Include a header row for column names. Default is True. show_unit : bool Include a header row for unit. Default is False. """ _more_tabcol = self._formatter._more_tabcol _more_tabcol(self, max_lines=max_lines, show_name=show_name, show_unit=show_unit) @property def unit(self): """ The unit associated with this column. May be a string or a `astropy.units.UnitBase` instance. Setting the ``unit`` property does not change the values of the data. To perform a unit conversion, use ``convert_unit_to``. """ return self._unit @unit.setter def unit(self, unit): if unit is None: self._unit = None else: self._unit = Unit(unit, parse_strict='silent') @unit.deleter def unit(self): self._unit = None def convert_unit_to(self, new_unit, equivalencies=[]): """ Converts the values of the column in-place from the current unit to the given unit. To change the unit associated with this column without actually changing the data values, simply set the ``unit`` property. Parameters ---------- new_unit : str or `astropy.units.UnitBase` instance The unit to convert to. equivalencies : list of equivalence pairs, optional A list of equivalence pairs to try if the unit are not directly convertible. See :ref:`unit_equivalencies`. Raises ------ astropy.units.UnitsError If units are inconsistent """ if self.unit is None: raise ValueError("No unit set on column") self.data[:] = self.unit.to( new_unit, self.data, equivalencies=equivalencies) self.unit = new_unit @property def groups(self): if not hasattr(self, '_groups'): self._groups = groups.ColumnGroups(self) return self._groups def group_by(self, keys): """ Group this column by the specified ``keys`` This effectively splits the column into groups which correspond to unique values of the ``keys`` grouping object. The output is a new `Column` or `MaskedColumn` which contains a copy of this column but sorted by row according to ``keys``. The ``keys`` input to ``group_by`` must be a numpy array with the same length as this column. Parameters ---------- keys : numpy array Key grouping object Returns ------- out : Column New column with groups attribute set accordingly """ return groups.column_group_by(self, keys) def _copy_groups(self, out): """ Copy current groups into a copy of self ``out`` """ if self.parent_table: if hasattr(self.parent_table, '_groups'): out._groups = groups.ColumnGroups(out, indices=self.parent_table._groups._indices) elif hasattr(self, '_groups'): out._groups = groups.ColumnGroups(out, indices=self._groups._indices) # Strip off the BaseColumn-ness for repr and str so that # MaskedColumn.data __repr__ does not include masked_BaseColumn(data = # [1 2], ...). def __repr__(self): return np.asarray(self).__repr__() @property def quantity(self): """ A view of this table column as a `~astropy.units.Quantity` object with units given by the Column's `unit` parameter. """ # the Quantity initializer is used here because it correctly fails # if the column's values are non-numeric (like strings), while .view # will happily return a quantity with gibberish for numerical values return Quantity(self, self.unit, copy=False, dtype=self.dtype, order='A', subok=True) def to(self, unit, equivalencies=[], **kwargs): """ Converts this table column to a `~astropy.units.Quantity` object with the requested units. Parameters ---------- unit : `~astropy.units.Unit` or str The unit to convert to (i.e., a valid argument to the :meth:`astropy.units.Quantity.to` method). equivalencies : list of equivalence pairs, optional Equivalencies to use for this conversion. See :meth:`astropy.units.Quantity.to` for more details. Returns ------- quantity : `~astropy.units.Quantity` A quantity object with the contents of this column in the units ``unit``. """ return self.quantity.to(unit, equivalencies) def _copy_attrs(self, obj): """ Copy key column attributes from ``obj`` to self """ for attr in ('name', 'unit', '_format', 'description'): val = getattr(obj, attr, None) setattr(self, attr, val) # Light copy of meta if it is not empty obj_meta = getattr(obj, 'meta', None) if obj_meta: self.meta = obj_meta.copy() @staticmethod def _encode_str(value): """ Encode anything that is unicode-ish as utf-8. This method is only called for Py3+. """ if isinstance(value, str): value = value.encode('utf-8') elif isinstance(value, bytes) or value is np.ma.masked: pass else: arr = np.asarray(value) if arr.dtype.char == 'U': arr = np.char.encode(arr, encoding='utf-8') if isinstance(value, np.ma.MaskedArray): arr = np.ma.array(arr, mask=value.mask, copy=False) value = arr return value def tolist(self): if self.dtype.kind == 'S': return np.chararray.decode(self, encoding='utf-8').tolist() else: return super().tolist()
[docs]class Column(BaseColumn): """Define a data column for use in a Table object. Parameters ---------- data : list, ndarray or None Column data values name : str Column name and key for reference within Table dtype : numpy.dtype compatible value Data type for column shape : tuple or () Dimensions of a single row element in the column data length : int or 0 Number of row elements in column data description : str or None Full description of column unit : str or None Physical unit format : str or None or function or callable Format string for outputting column values. This can be an "old-style" (``format % value``) or "new-style" (`str.format`) format specification string or a function or any callable object that accepts a single value and returns a string. meta : dict-like or None Meta-data associated with the column Examples -------- A Column can be created in two different ways: - Provide a ``data`` value but not ``shape`` or ``length`` (which are inferred from the data). Examples:: col = Column(data=[1, 2], name='name') # shape=(2,) col = Column(data=[[1, 2], [3, 4]], name='name') # shape=(2, 2) col = Column(data=[1, 2], name='name', dtype=float) col = Column(data=np.array([1, 2]), name='name') col = Column(data=['hello', 'world'], name='name') The ``dtype`` argument can be any value which is an acceptable fixed-size data-type initializer for the numpy.dtype() method. See `<https://docs.scipy.org/doc/numpy/reference/arrays.dtypes.html>`_. Examples include: - Python non-string type (float, int, bool) - Numpy non-string type (e.g. np.float32, np.int64, np.bool\\_) - Numpy.dtype array-protocol type strings (e.g. 'i4', 'f8', 'S15') If no ``dtype`` value is provide then the type is inferred using ``np.array(data)``. - Provide ``length`` and optionally ``shape``, but not ``data`` Examples:: col = Column(name='name', length=5) col = Column(name='name', dtype=int, length=10, shape=(3,4)) The default ``dtype`` is ``np.float64``. The ``shape`` argument is the array shape of a single cell in the column. """ def __new__(cls, data=None, name=None, dtype=None, shape=(), length=0, description=None, unit=None, format=None, meta=None, copy=False, copy_indices=True): if isinstance(data, MaskedColumn) and np.any(data.mask): raise TypeError("Cannot convert a MaskedColumn with masked value to a Column") self = super().__new__( cls, data=data, name=name, dtype=dtype, shape=shape, length=length, description=description, unit=unit, format=format, meta=meta, copy=copy, copy_indices=copy_indices) return self def __setattr__(self, item, value): if not isinstance(self, MaskedColumn) and item == "mask": raise AttributeError("cannot set mask value to a column in non-masked Table") super().__setattr__(item, value) if item == 'unit' and issubclass(self.dtype.type, np.number): try: converted = self.parent_table._convert_col_for_table(self) except AttributeError: # Either no parent table or parent table is None pass else: if converted is not self: self.parent_table.replace_column(self.name, converted) def _base_repr_(self, html=False): # If scalar then just convert to correct numpy type and use numpy repr if self.ndim == 0: return repr(self.item()) descr_vals = [self.__class__.__name__] unit = None if self.unit is None else str(self.unit) shape = None if self.ndim <= 1 else self.shape[1:] for attr, val in (('name', self.name), ('dtype', dtype_info_name(self.dtype)), ('shape', shape), ('unit', unit), ('format', self.format), ('description', self.description), ('length', len(self))): if val is not None: descr_vals.append(f'{attr}={val!r}') descr = '<' + ' '.join(descr_vals) + '>\n' if html: from astropy.utils.xml.writer import xml_escape descr = xml_escape(descr) data_lines, outs = self._formatter._pformat_col( self, show_name=False, show_unit=False, show_length=False, html=html) out = descr + '\n'.join(data_lines) return out def _repr_html_(self): return self._base_repr_(html=True) def __repr__(self): return self._base_repr_(html=False) def __str__(self): # If scalar then just convert to correct numpy type and use numpy repr if self.ndim == 0: return str(self.item()) lines, outs = self._formatter._pformat_col(self) return '\n'.join(lines) def __bytes__(self): return str(self).encode('utf-8') def _check_string_truncate(self, value): """ Emit a warning if any elements of ``value`` will be truncated when ``value`` is assigned to self. """ # Convert input ``value`` to the string dtype of this column and # find the length of the longest string in the array. value = np.asanyarray(value, dtype=self.dtype.type) if value.size == 0: return value_str_len = np.char.str_len(value).max() # Parse the array-protocol typestring (e.g. '|U15') of self.dtype which # has the character repeat count on the right side. self_str_len = dtype_bytes_or_chars(self.dtype) if value_str_len > self_str_len: warnings.warn('truncated right side string(s) longer than {} ' 'character(s) during assignment' .format(self_str_len), StringTruncateWarning, stacklevel=3) def __setitem__(self, index, value): if self.dtype.char == 'S': value = self._encode_str(value) # Issue warning for string assignment that truncates ``value`` if issubclass(self.dtype.type, np.character): self._check_string_truncate(value) # update indices self.info.adjust_indices(index, value, len(self)) # Set items using a view of the underlying data, as it gives an # order-of-magnitude speed-up. [#2994] self.data[index] = value def _make_compare(oper): """ Make comparison methods which encode the ``other`` object to utf-8 in the case of a bytestring dtype for Py3+. """ swapped_oper = {'__eq__': '__eq__', '__ne__': '__ne__', '__gt__': '__lt__', '__lt__': '__gt__', '__ge__': '__le__', '__le__': '__ge__'}[oper] def _compare(self, other): op = oper # copy enclosed ref to allow swap below # Special case to work around #6838. Other combinations work OK, # see tests.test_column.test_unicode_sandwich_compare(). In this # case just swap self and other. # # This is related to an issue in numpy that was addressed in np 1.13. # However that fix does not make this problem go away, but maybe # future numpy versions will do so. NUMPY_LT_1_13 to get the # attention of future maintainers to check (by deleting or versioning # the if block below). See #6899 discussion. # 2019-06-21: still needed with numpy 1.16. if (isinstance(self, MaskedColumn) and self.dtype.kind == 'U' and isinstance(other, MaskedColumn) and other.dtype.kind == 'S'): self, other = other, self op = swapped_oper if self.dtype.char == 'S': other = self._encode_str(other) # Now just let the regular ndarray.__eq__, etc., take over. result = getattr(super(), op)(other) # But we should not return Column instances for this case. return result.data if isinstance(result, Column) else result return _compare __eq__ = _make_compare('__eq__') __ne__ = _make_compare('__ne__') __gt__ = _make_compare('__gt__') __lt__ = _make_compare('__lt__') __ge__ = _make_compare('__ge__') __le__ = _make_compare('__le__')
[docs] def insert(self, obj, values, axis=0): """ Insert values before the given indices in the column and return a new `~astropy.table.Column` object. Parameters ---------- obj : int, slice or sequence of ints Object that defines the index or indices before which ``values`` is inserted. values : array_like Value(s) to insert. If the type of ``values`` is different from that of the column, ``values`` is converted to the matching type. ``values`` should be shaped so that it can be broadcast appropriately. axis : int, optional Axis along which to insert ``values``. If ``axis`` is None then the column array is flattened before insertion. Default is 0, which will insert a row. Returns ------- out : `~astropy.table.Column` A copy of column with ``values`` and ``mask`` inserted. Note that the insertion does not occur in-place: a new column is returned. """ if self.dtype.kind == 'O': # Even if values is array-like (e.g. [1,2,3]), insert as a single # object. Numpy.insert instead inserts each element in an array-like # input individually. data = np.insert(self, obj, None, axis=axis) data[obj] = values else: self_for_insert = _expand_string_array_for_values(self, values) data = np.insert(self_for_insert, obj, values, axis=axis) out = data.view(self.__class__) out.__array_finalize__(self) return out
# We do this to make the methods show up in the API docs name = BaseColumn.name unit = BaseColumn.unit copy = BaseColumn.copy more = BaseColumn.more pprint = BaseColumn.pprint pformat = BaseColumn.pformat convert_unit_to = BaseColumn.convert_unit_to quantity = BaseColumn.quantity to = BaseColumn.to
class MaskedColumnInfo(ColumnInfo): """ Container for meta information like name, description, format. This is required when the object is used as a mixin column within a table, but can be used as a general way to store meta information. In this case it just adds the ``mask_val`` attribute. """ # Add `serialize_method` attribute to the attrs that MaskedColumnInfo knows # about. This allows customization of the way that MaskedColumn objects # get written to file depending on format. The default is to use whatever # the writer would normally do, which in the case of FITS or ECSV is to use # a NULL value within the data itself. If serialize_method is 'data_mask' # then the mask is explicitly written out as a separate column if there # are any masked values. See also code below. attr_names = ColumnInfo.attr_names | {'serialize_method'} # When `serialize_method` is 'data_mask', and data and mask are being written # as separate columns, use column names <name> and <name>.mask (instead # of default encoding as <name>.data and <name>.mask). _represent_as_dict_primary_data = 'data' mask_val = np.ma.masked def __init__(self, bound=False): super().__init__(bound) # If bound to a data object instance then create the dict of attributes # which stores the info attribute values. if bound: # Specify how to serialize this object depending on context. self.serialize_method = {'fits': 'null_value', 'ecsv': 'null_value', 'hdf5': 'data_mask', None: 'null_value'} def _represent_as_dict(self): out = super()._represent_as_dict() col = self._parent # If the serialize method for this context (e.g. 'fits' or 'ecsv') is # 'data_mask', that means to serialize using an explicit mask column. method = self.serialize_method[self._serialize_context] if method == 'data_mask': # Note: a driver here is a performance issue in #8443 where repr() of a # np.ma.MaskedArray value is up to 10 times slower than repr of a normal array # value. So regardless of whether there are masked elements it is useful to # explicitly define this as a serialized column and use col.data.data (ndarray) # instead of letting it fall through to the "standard" serialization machinery. out['data'] = col.data.data if np.any(col.mask): # Only if there are actually masked elements do we add the ``mask`` column out['mask'] = col.mask elif method == 'null_value': pass else: raise ValueError('serialize method must be either "data_mask" or "null_value"') return out
[docs]class MaskedColumn(Column, _MaskedColumnGetitemShim, ma.MaskedArray): """Define a masked data column for use in a Table object. Parameters ---------- data : list, ndarray or None Column data values name : str Column name and key for reference within Table mask : list, ndarray or None Boolean mask for which True indicates missing or invalid data fill_value : float, int, str or None Value used when filling masked column elements dtype : numpy.dtype compatible value Data type for column shape : tuple or () Dimensions of a single row element in the column data length : int or 0 Number of row elements in column data description : str or None Full description of column unit : str or None Physical unit format : str or None or function or callable Format string for outputting column values. This can be an "old-style" (``format % value``) or "new-style" (`str.format`) format specification string or a function or any callable object that accepts a single value and returns a string. meta : dict-like or None Meta-data associated with the column Examples -------- A MaskedColumn is similar to a Column except that it includes ``mask`` and ``fill_value`` attributes. It can be created in two different ways: - Provide a ``data`` value but not ``shape`` or ``length`` (which are inferred from the data). Examples:: col = MaskedColumn(data=[1, 2], name='name') col = MaskedColumn(data=[1, 2], name='name', mask=[True, False]) col = MaskedColumn(data=[1, 2], name='name', dtype=float, fill_value=99) The ``mask`` argument will be cast as a boolean array and specifies which elements are considered to be missing or invalid. The ``dtype`` argument can be any value which is an acceptable fixed-size data-type initializer for the numpy.dtype() method. See `<https://docs.scipy.org/doc/numpy/reference/arrays.dtypes.html>`_. Examples include: - Python non-string type (float, int, bool) - Numpy non-string type (e.g. np.float32, np.int64, np.bool\\_) - Numpy.dtype array-protocol type strings (e.g. 'i4', 'f8', 'S15') If no ``dtype`` value is provide then the type is inferred using ``np.array(data)``. When ``data`` is provided then the ``shape`` and ``length`` arguments are ignored. - Provide ``length`` and optionally ``shape``, but not ``data`` Examples:: col = MaskedColumn(name='name', length=5) col = MaskedColumn(name='name', dtype=int, length=10, shape=(3,4)) The default ``dtype`` is ``np.float64``. The ``shape`` argument is the array shape of a single cell in the column. """ info = MaskedColumnInfo() def __new__(cls, data=None, name=None, mask=None, fill_value=None, dtype=None, shape=(), length=0, description=None, unit=None, format=None, meta=None, copy=False, copy_indices=True): if mask is None: # If mask is None then we need to determine the mask (if any) from the data. # The naive method is looking for a mask attribute on data, but this can fail, # see #8816. Instead use ``MaskedArray`` to do the work. mask = ma.MaskedArray(data).mask if mask is np.ma.nomask: # Handle odd-ball issue with np.ma.nomask (numpy #13758), and see below. mask = False elif copy: mask = mask.copy() elif mask is np.ma.nomask: # Force the creation of a full mask array as nomask is tricky to # use and will fail in an unexpected manner when setting a value # to the mask. mask = False else: mask = deepcopy(mask) # Create self using MaskedArray as a wrapper class, following the example of # class MSubArray in # https://github.com/numpy/numpy/blob/maintenance/1.8.x/numpy/ma/tests/test_subclassing.py # This pattern makes it so that __array_finalize__ is called as expected (e.g. #1471 and # https://github.com/astropy/astropy/commit/ff6039e8) # First just pass through all args and kwargs to BaseColumn, then wrap that object # with MaskedArray. self_data = BaseColumn(data, dtype=dtype, shape=shape, length=length, name=name, unit=unit, format=format, description=description, meta=meta, copy=copy, copy_indices=copy_indices) self = ma.MaskedArray.__new__(cls, data=self_data, mask=mask) # Note: do not set fill_value in the MaskedArray constructor because this does not # go through the fill_value workarounds. if fill_value is None and getattr(data, 'fill_value', None) is not None: # Coerce the fill_value to the correct type since `data` may be a # different dtype than self. fill_value = np.array(data.fill_value, self.dtype)[()] self.fill_value = fill_value self.parent_table = None # needs to be done here since self doesn't come from BaseColumn.__new__ for index in self.indices: index.replace_col(self_data, self) return self @property def fill_value(self): return self.get_fill_value() # defer to native ma.MaskedArray method @fill_value.setter def fill_value(self, val): """Set fill value both in the masked column view and in the parent table if it exists. Setting one or the other alone doesn't work.""" # another ma bug workaround: If the value of fill_value for a string array is # requested but not yet set then it gets created as 'N/A'. From this point onward # any new fill_values are truncated to 3 characters. Note that this does not # occur if the masked array is a structured array (as in the previous block that # deals with the parent table). # # >>> x = ma.array(['xxxx']) # >>> x.fill_value # fill_value now gets represented as an 'S3' array # 'N/A' # >>> x.fill_value='yyyy' # >>> x.fill_value # 'yyy' # # To handle this we are forced to reset a private variable first: self._fill_value = None self.set_fill_value(val) # defer to native ma.MaskedArray method @property def data(self): """The plain MaskedArray data held by this column.""" out = self.view(np.ma.MaskedArray) # By default, a MaskedArray view will set the _baseclass to be the # same as that of our own class, i.e., BaseColumn. Since we want # to return a plain MaskedArray, we reset the baseclass accordingly. out._baseclass = np.ndarray return out
[docs] def filled(self, fill_value=None): """Return a copy of self, with masked values filled with a given value. Parameters ---------- fill_value : scalar; optional The value to use for invalid entries (`None` by default). If `None`, the ``fill_value`` attribute of the array is used instead. Returns ------- filled_column : Column A copy of ``self`` with masked entries replaced by `fill_value` (be it the function argument or the attribute of ``self``). """ if fill_value is None: fill_value = self.fill_value data = super().filled(fill_value) # Use parent table definition of Column if available column_cls = self.parent_table.Column if (self.parent_table is not None) else Column out = column_cls(name=self.name, data=data, unit=self.unit, format=self.format, description=self.description, meta=deepcopy(self.meta)) return out
[docs] def insert(self, obj, values, mask=None, axis=0): """ Insert values along the given axis before the given indices and return a new `~astropy.table.MaskedColumn` object. Parameters ---------- obj : int, slice or sequence of ints Object that defines the index or indices before which ``values`` is inserted. values : array_like Value(s) to insert. If the type of ``values`` is different from that of the column, ``values`` is converted to the matching type. ``values`` should be shaped so that it can be broadcast appropriately. mask : bool or array_like Mask value(s) to insert. If not supplied, and values does not have a mask either, then False is used. axis : int, optional Axis along which to insert ``values``. If ``axis`` is None then the column array is flattened before insertion. Default is 0, which will insert a row. Returns ------- out : `~astropy.table.MaskedColumn` A copy of column with ``values`` and ``mask`` inserted. Note that the insertion does not occur in-place: a new masked column is returned. """ self_ma = self.data # self viewed as MaskedArray if self.dtype.kind == 'O': # Even if values is array-like (e.g. [1,2,3]), insert as a single # object. Numpy.insert instead inserts each element in an array-like # input individually. new_data = np.insert(self_ma.data, obj, None, axis=axis) new_data[obj] = values else: self_ma = _expand_string_array_for_values(self_ma, values) new_data = np.insert(self_ma.data, obj, values, axis=axis) if mask is None: mask = getattr(values, 'mask', np.ma.nomask) if mask is np.ma.nomask: if self.dtype.kind == 'O': mask = False else: mask = np.zeros(np.shape(values), dtype=bool) new_mask = np.insert(self_ma.mask, obj, mask, axis=axis) new_ma = np.ma.array(new_data, mask=new_mask, copy=False) out = new_ma.view(self.__class__) out.parent_table = None out.indices = [] out._copy_attrs(self) out.fill_value = self.fill_value return out
def _copy_attrs_slice(self, out): # Fixes issue #3023: when calling getitem with a MaskedArray subclass # the original object attributes are not copied. if out.__class__ is self.__class__: out.parent_table = None # we need this because __getitem__ does a shallow copy of indices if out.indices is self.indices: out.indices = [] out._copy_attrs(self) return out def __setitem__(self, index, value): # Issue warning for string assignment that truncates ``value`` if self.dtype.char == 'S': value = self._encode_str(value) if issubclass(self.dtype.type, np.character): # Account for a bug in np.ma.MaskedArray setitem. # https://github.com/numpy/numpy/issues/8624 value = np.ma.asanyarray(value, dtype=self.dtype.type) # Check for string truncation after filling masked items with # empty (zero-length) string. Note that filled() does not make # a copy if there are no masked items. self._check_string_truncate(value.filled('')) # update indices self.info.adjust_indices(index, value, len(self)) ma.MaskedArray.__setitem__(self, index, value) # We do this to make the methods show up in the API docs name = BaseColumn.name copy = BaseColumn.copy more = BaseColumn.more pprint = BaseColumn.pprint pformat = BaseColumn.pformat convert_unit_to = BaseColumn.convert_unit_to