# Licensed under a 3-clause BSD style license - see LICENSE.rst
import warnings
import weakref
from copy import deepcopy
import numpy as np
from numpy import ma
from astropy.units import Unit, Quantity
from astropy.utils.console import color_print
from astropy.utils.metadata import MetaData
from astropy.utils.data_info import BaseColumnInfo, dtype_info_name
from astropy.utils.misc import dtype_bytes_or_chars
from . import groups
from . import pprint
from .np_utils import fix_column_name
# These "shims" provide __getitem__ implementations for Column and MaskedColumn
from ._column_mixins import _ColumnGetitemShim, _MaskedColumnGetitemShim
# Create a generic TableFormatter object for use by bare columns with no
# parent table.
FORMATTER = pprint.TableFormatter()
[docs]class StringTruncateWarning(UserWarning):
"""
Warning class for when a string column is assigned a value
that gets truncated because the base (numpy) string length
is too short.
This does not inherit from AstropyWarning because we want to use
stacklevel=2 to show the user where the issue occurred in their code.
"""
pass
# Always emit this warning, not just the first instance
warnings.simplefilter('always', StringTruncateWarning)
def _auto_names(n_cols):
from . import conf
return [str(conf.auto_colname).format(i) for i in range(n_cols)]
# list of one and two-dimensional comparison functions, which sometimes return
# a Column class and sometimes a plain array. Used in __array_wrap__ to ensure
# they only return plain (masked) arrays (see #1446 and #1685)
_comparison_functions = set(
[np.greater, np.greater_equal, np.less, np.less_equal,
np.not_equal, np.equal,
np.isfinite, np.isinf, np.isnan, np.sign, np.signbit])
def col_copy(col, copy_indices=True):
"""
Mixin-safe version of Column.copy() (with copy_data=True).
Parameters
----------
col : Column or mixin column
Input column
copy_indices : bool
Copy the column ``indices`` attribute
Returns
-------
col : Copy of input column
"""
if isinstance(col, BaseColumn):
return col.copy()
# The new column should have None for the parent_table ref. If the
# original parent_table weakref there at the point of copying then it
# generates an infinite recursion. Instead temporarily remove the weakref
# on the original column and restore after the copy in an exception-safe
# manner.
parent_table = col.info.parent_table
indices = col.info.indices
col.info.parent_table = None
col.info.indices = []
try:
newcol = col.copy() if hasattr(col, 'copy') else deepcopy(col)
newcol.info = col.info
newcol.info.indices = deepcopy(indices or []) if copy_indices else []
for index in newcol.info.indices:
index.replace_col(col, newcol)
finally:
col.info.parent_table = parent_table
col.info.indices = indices
return newcol
class FalseArray(np.ndarray):
"""
Boolean mask array that is always False.
This is used to create a stub ``mask`` property which is a boolean array of
``False`` used by default for mixin columns and corresponding to the mixin
column data shape. The ``mask`` looks like a normal numpy array but an
exception will be raised if ``True`` is assigned to any element. The
consequences of the limitation are most obvious in the high-level table
operations.
Parameters
----------
shape : tuple
Data shape
"""
def __new__(cls, shape):
obj = np.zeros(shape, dtype=bool).view(cls)
return obj
def __setitem__(self, item, val):
val = np.asarray(val)
if np.any(val):
raise ValueError('Cannot set any element of {} class to True'
.format(self.__class__.__name__))
def _expand_string_array_for_values(arr, values):
"""
For string-dtype return a version of ``arr`` that is wide enough for ``values``.
If ``arr`` is not string-dtype or does not need expansion then return ``arr``.
Parameters
----------
arr : np.ndarray
Input array
values : scalar or array_like
Values for width comparison for string arrays
Returns
-------
arr_expanded : np.ndarray
"""
if arr.dtype.kind in ('U', 'S') and values is not np.ma.masked:
# Find the length of the longest string in the new values.
values_str_len = np.char.str_len(values).max()
# Determine character repeat count of arr.dtype. Returns a positive
# int or None (something like 'U0' is not possible in numpy). If new values
# are longer than current then make a new (wider) version of arr.
arr_str_len = dtype_bytes_or_chars(arr.dtype)
if arr_str_len and values_str_len > arr_str_len:
arr_dtype = arr.dtype.byteorder + arr.dtype.kind + str(values_str_len)
arr = arr.astype(arr_dtype)
return arr
[docs]class ColumnInfo(BaseColumnInfo):
"""
Container for meta information like name, description, format.
This is required when the object is used as a mixin column within a table,
but can be used as a general way to store meta information.
"""
attrs_from_parent = BaseColumnInfo.attr_names
_supports_indexing = True
def new_like(self, cols, length, metadata_conflicts='warn', name=None):
"""
Return a new Column instance which is consistent with the
input ``cols`` and has ``length`` rows.
This is intended for creating an empty column object whose elements can
be set in-place for table operations like join or vstack.
Parameters
----------
cols : list
List of input columns
length : int
Length of the output column object
metadata_conflicts : str ('warn'|'error'|'silent')
How to handle metadata conflicts
name : str
Output column name
Returns
-------
col : Column (or subclass)
New instance of this class consistent with ``cols``
"""
attrs = self.merge_cols_attributes(cols, metadata_conflicts, name,
('meta', 'unit', 'format', 'description'))
return self._parent_cls(length=length, **attrs)
def get_sortable_arrays(self):
"""
Return a list of arrays which can be lexically sorted to represent
the order of the parent column.
For Column this is just the column itself.
Returns
-------
arrays : list of ndarray
"""
return [self._parent]
class BaseColumn(_ColumnGetitemShim, np.ndarray):
meta = MetaData()
def __new__(cls, data=None, name=None,
dtype=None, shape=(), length=0,
description=None, unit=None, format=None, meta=None,
copy=False, copy_indices=True):
if data is None:
dtype = (np.dtype(dtype).str, shape)
self_data = np.zeros(length, dtype=dtype)
elif isinstance(data, BaseColumn) and hasattr(data, '_name'):
# When unpickling a MaskedColumn, ``data`` will be a bare
# BaseColumn with none of the expected attributes. In this case
# do NOT execute this block which initializes from ``data``
# attributes.
self_data = np.array(data.data, dtype=dtype, copy=copy)
if description is None:
description = data.description
if unit is None:
unit = unit or data.unit
if format is None:
format = data.format
if meta is None:
meta = data.meta
if name is None:
name = data.name
elif isinstance(data, Quantity):
if unit is None:
self_data = np.array(data, dtype=dtype, copy=copy)
unit = data.unit
else:
self_data = np.array(data.to(unit), dtype=dtype, copy=copy)
if description is None:
description = data.info.description
if format is None:
format = data.info.format
if meta is None:
meta = data.info.meta
else:
if np.dtype(dtype).char == 'S':
data = cls._encode_str(data)
self_data = np.array(data, dtype=dtype, copy=copy)
self = self_data.view(cls)
self._name = fix_column_name(name)
self._parent_table = None
self.unit = unit
self._format = format
self.description = description
self.meta = meta
self.indices = deepcopy(getattr(data, 'indices', [])) if copy_indices else []
for index in self.indices:
index.replace_col(data, self)
return self
@property
def data(self):
return self.view(np.ndarray)
@property
def parent_table(self):
# Note: It seems there are some cases where _parent_table is not set,
# such after restoring from a pickled Column. Perhaps that should be
# fixed, but this is also okay for now.
if getattr(self, '_parent_table', None) is None:
return None
else:
return self._parent_table()
@parent_table.setter
def parent_table(self, table):
if table is None:
self._parent_table = None
else:
self._parent_table = weakref.ref(table)
info = ColumnInfo()
def copy(self, order='C', data=None, copy_data=True):
"""
Return a copy of the current instance.
If ``data`` is supplied then a view (reference) of ``data`` is used,
and ``copy_data`` is ignored.
Parameters
----------
order : {'C', 'F', 'A', 'K'}, optional
Controls the memory layout of the copy. 'C' means C-order,
'F' means F-order, 'A' means 'F' if ``a`` is Fortran contiguous,
'C' otherwise. 'K' means match the layout of ``a`` as closely
as possible. (Note that this function and :func:numpy.copy are very
similar, but have different default values for their order=
arguments.) Default is 'C'.
data : array, optional
If supplied then use a view of ``data`` instead of the instance
data. This allows copying the instance attributes and meta.
copy_data : bool, optional
Make a copy of the internal numpy array instead of using a
reference. Default is True.
Returns
-------
col : Column or MaskedColumn
Copy of the current column (same type as original)
"""
if data is None:
data = self.data
if copy_data:
data = data.copy(order)
out = data.view(self.__class__)
out.__array_finalize__(self)
# If there is meta on the original column then deepcopy (since "copy" of column
# implies complete independence from original). __array_finalize__ will have already
# made a light copy. I'm not sure how to avoid that initial light copy.
if self.meta is not None:
out.meta = self.meta # MetaData descriptor does a deepcopy here
# for MaskedColumn, MaskedArray.__array_finalize__ also copies mask
# from self, which is not the idea here, so undo
if isinstance(self, MaskedColumn):
out._mask = data._mask
self._copy_groups(out)
return out
def __setstate__(self, state):
"""
Restore the internal state of the Column/MaskedColumn for pickling
purposes. This requires that the last element of ``state`` is a
5-tuple that has Column-specific state values.
"""
# Get the Column attributes
names = ('_name', '_unit', '_format', 'description', 'meta', 'indices')
attrs = {name: val for name, val in zip(names, state[-1])}
state = state[:-1]
# Using super().__setstate__(state) gives
# "TypeError 'int' object is not iterable", raised in
# astropy.table._column_mixins._ColumnGetitemShim.__setstate_cython__()
# Previously, it seems to have given an infinite recursion.
# Hence, manually call the right super class to actually set up
# the array object.
super_class = ma.MaskedArray if isinstance(self, ma.MaskedArray) else np.ndarray
super_class.__setstate__(self, state)
# Set the Column attributes
for name, val in attrs.items():
setattr(self, name, val)
self._parent_table = None
def __reduce__(self):
"""
Return a 3-tuple for pickling a Column. Use the super-class
functionality but then add in a 5-tuple of Column-specific values
that get used in __setstate__.
"""
super_class = ma.MaskedArray if isinstance(self, ma.MaskedArray) else np.ndarray
reconstruct_func, reconstruct_func_args, state = super_class.__reduce__(self)
# Define Column-specific attrs and meta that gets added to state.
column_state = (self.name, self.unit, self.format, self.description,
self.meta, self.indices)
state = state + (column_state,)
return reconstruct_func, reconstruct_func_args, state
def __array_finalize__(self, obj):
# Obj will be none for direct call to Column() creator
if obj is None:
return
if callable(super().__array_finalize__):
super().__array_finalize__(obj)
# Self was created from template (e.g. obj[slice] or (obj * 2))
# or viewcast e.g. obj.view(Column). In either case we want to
# init Column attributes for self from obj if possible.
self.parent_table = None
if not hasattr(self, 'indices'): # may have been copied in __new__
self.indices = []
self._copy_attrs(obj)
def __array_wrap__(self, out_arr, context=None):
"""
__array_wrap__ is called at the end of every ufunc.
Normally, we want a Column object back and do not have to do anything
special. But there are two exceptions:
1) If the output shape is different (e.g. for reduction ufuncs
like sum() or mean()), a Column still linking to a parent_table
makes little sense, so we return the output viewed as the
column content (ndarray or MaskedArray).
For this case, we use "[()]" to select everything, and to ensure we
convert a zero rank array to a scalar. (For some reason np.sum()
returns a zero rank scalar array while np.mean() returns a scalar;
So the [()] is needed for this case.
2) When the output is created by any function that returns a boolean
we also want to consistently return an array rather than a column
(see #1446 and #1685)
"""
out_arr = super().__array_wrap__(out_arr, context)
if (self.shape != out_arr.shape
or (isinstance(out_arr, BaseColumn)
and (context is not None
and context[0] in _comparison_functions))):
return out_arr.data[()]
else:
return out_arr
@property
def name(self):
"""
The name of this column.
"""
return self._name
@name.setter
def name(self, val):
val = fix_column_name(val)
if self.parent_table is not None:
table = self.parent_table
table.columns._rename_column(self.name, val)
self._name = val
@property
def format(self):
"""
Format string for displaying values in this column.
"""
return self._format
@format.setter
def format(self, format_string):
prev_format = getattr(self, '_format', None)
self._format = format_string # set new format string
try:
# test whether it formats without error exemplarily
self.pformat(max_lines=1)
except Exception as err:
# revert to restore previous format if there was one
self._format = prev_format
raise ValueError(
"Invalid format for column '{}': could not display "
"values in this column using this format ({})".format(
self.name, err.args[0]))
@property
def descr(self):
"""Array-interface compliant full description of the column.
This returns a 3-tuple (name, type, shape) that can always be
used in a structured array dtype definition.
"""
return (self.name, self.dtype.str, self.shape[1:])
def iter_str_vals(self):
"""
Return an iterator that yields the string-formatted values of this
column.
Returns
-------
str_vals : iterator
Column values formatted as strings
"""
# Iterate over formatted values with no max number of lines, no column
# name, no unit, and ignoring the returned header info in outs.
_pformat_col_iter = self._formatter._pformat_col_iter
for str_val in _pformat_col_iter(self, -1, show_name=False, show_unit=False,
show_dtype=False, outs={}):
yield str_val
def attrs_equal(self, col):
"""Compare the column attributes of ``col`` to this object.
The comparison attributes are: ``name``, ``unit``, ``dtype``,
``format``, ``description``, and ``meta``.
Parameters
----------
col : Column
Comparison column
Returns
-------
equal : bool
True if all attributes are equal
"""
if not isinstance(col, BaseColumn):
raise ValueError('Comparison `col` must be a Column or '
'MaskedColumn object')
attrs = ('name', 'unit', 'dtype', 'format', 'description', 'meta')
equal = all(getattr(self, x) == getattr(col, x) for x in attrs)
return equal
@property
def _formatter(self):
return FORMATTER if (self.parent_table is None) else self.parent_table.formatter
def pformat(self, max_lines=None, show_name=True, show_unit=False, show_dtype=False,
html=False):
"""Return a list of formatted string representation of column values.
If no value of ``max_lines`` is supplied then the height of the
screen terminal is used to set ``max_lines``. If the terminal
height cannot be determined then the default will be
determined using the ``astropy.conf.max_lines`` configuration
item. If a negative value of ``max_lines`` is supplied then
there is no line limit applied.
Parameters
----------
max_lines : int
Maximum lines of output (header + data rows)
show_name : bool
Include column name. Default is True.
show_unit : bool
Include a header row for unit. Default is False.
show_dtype : bool
Include column dtype. Default is False.
html : bool
Format the output as an HTML table. Default is False.
Returns
-------
lines : list
List of lines with header and formatted column values
"""
_pformat_col = self._formatter._pformat_col
lines, outs = _pformat_col(self, max_lines, show_name=show_name,
show_unit=show_unit, show_dtype=show_dtype,
html=html)
return lines
def pprint(self, max_lines=None, show_name=True, show_unit=False, show_dtype=False):
"""Print a formatted string representation of column values.
If no value of ``max_lines`` is supplied then the height of the
screen terminal is used to set ``max_lines``. If the terminal
height cannot be determined then the default will be
determined using the ``astropy.conf.max_lines`` configuration
item. If a negative value of ``max_lines`` is supplied then
there is no line limit applied.
Parameters
----------
max_lines : int
Maximum number of values in output
show_name : bool
Include column name. Default is True.
show_unit : bool
Include a header row for unit. Default is False.
show_dtype : bool
Include column dtype. Default is True.
"""
_pformat_col = self._formatter._pformat_col
lines, outs = _pformat_col(self, max_lines, show_name=show_name, show_unit=show_unit,
show_dtype=show_dtype)
n_header = outs['n_header']
for i, line in enumerate(lines):
if i < n_header:
color_print(line, 'red')
else:
print(line)
def more(self, max_lines=None, show_name=True, show_unit=False):
"""Interactively browse column with a paging interface.
Supported keys::
f, <space> : forward one page
b : back one page
r : refresh same page
n : next row
p : previous row
< : go to beginning
> : go to end
q : quit browsing
h : print this help
Parameters
----------
max_lines : int
Maximum number of lines in table output.
show_name : bool
Include a header row for column names. Default is True.
show_unit : bool
Include a header row for unit. Default is False.
"""
_more_tabcol = self._formatter._more_tabcol
_more_tabcol(self, max_lines=max_lines, show_name=show_name,
show_unit=show_unit)
@property
def unit(self):
"""
The unit associated with this column. May be a string or a
`astropy.units.UnitBase` instance.
Setting the ``unit`` property does not change the values of the
data. To perform a unit conversion, use ``convert_unit_to``.
"""
return self._unit
@unit.setter
def unit(self, unit):
if unit is None:
self._unit = None
else:
self._unit = Unit(unit, parse_strict='silent')
@unit.deleter
def unit(self):
self._unit = None
def convert_unit_to(self, new_unit, equivalencies=[]):
"""
Converts the values of the column in-place from the current
unit to the given unit.
To change the unit associated with this column without
actually changing the data values, simply set the ``unit``
property.
Parameters
----------
new_unit : str or `astropy.units.UnitBase` instance
The unit to convert to.
equivalencies : list of equivalence pairs, optional
A list of equivalence pairs to try if the unit are not
directly convertible. See :ref:`unit_equivalencies`.
Raises
------
astropy.units.UnitsError
If units are inconsistent
"""
if self.unit is None:
raise ValueError("No unit set on column")
self.data[:] = self.unit.to(
new_unit, self.data, equivalencies=equivalencies)
self.unit = new_unit
@property
def groups(self):
if not hasattr(self, '_groups'):
self._groups = groups.ColumnGroups(self)
return self._groups
def group_by(self, keys):
"""
Group this column by the specified ``keys``
This effectively splits the column into groups which correspond to
unique values of the ``keys`` grouping object. The output is a new
`Column` or `MaskedColumn` which contains a copy of this column but
sorted by row according to ``keys``.
The ``keys`` input to ``group_by`` must be a numpy array with the
same length as this column.
Parameters
----------
keys : numpy array
Key grouping object
Returns
-------
out : Column
New column with groups attribute set accordingly
"""
return groups.column_group_by(self, keys)
def _copy_groups(self, out):
"""
Copy current groups into a copy of self ``out``
"""
if self.parent_table:
if hasattr(self.parent_table, '_groups'):
out._groups = groups.ColumnGroups(out, indices=self.parent_table._groups._indices)
elif hasattr(self, '_groups'):
out._groups = groups.ColumnGroups(out, indices=self._groups._indices)
# Strip off the BaseColumn-ness for repr and str so that
# MaskedColumn.data __repr__ does not include masked_BaseColumn(data =
# [1 2], ...).
def __repr__(self):
return np.asarray(self).__repr__()
@property
def quantity(self):
"""
A view of this table column as a `~astropy.units.Quantity` object with
units given by the Column's `unit` parameter.
"""
# the Quantity initializer is used here because it correctly fails
# if the column's values are non-numeric (like strings), while .view
# will happily return a quantity with gibberish for numerical values
return Quantity(self, self.unit, copy=False, dtype=self.dtype, order='A', subok=True)
def to(self, unit, equivalencies=[], **kwargs):
"""
Converts this table column to a `~astropy.units.Quantity` object with
the requested units.
Parameters
----------
unit : `~astropy.units.Unit` or str
The unit to convert to (i.e., a valid argument to the
:meth:`astropy.units.Quantity.to` method).
equivalencies : list of equivalence pairs, optional
Equivalencies to use for this conversion. See
:meth:`astropy.units.Quantity.to` for more details.
Returns
-------
quantity : `~astropy.units.Quantity`
A quantity object with the contents of this column in the units
``unit``.
"""
return self.quantity.to(unit, equivalencies)
def _copy_attrs(self, obj):
"""
Copy key column attributes from ``obj`` to self
"""
for attr in ('name', 'unit', '_format', 'description'):
val = getattr(obj, attr, None)
setattr(self, attr, val)
# Light copy of meta if it is not empty
obj_meta = getattr(obj, 'meta', None)
if obj_meta:
self.meta = obj_meta.copy()
@staticmethod
def _encode_str(value):
"""
Encode anything that is unicode-ish as utf-8. This method is only
called for Py3+.
"""
if isinstance(value, str):
value = value.encode('utf-8')
elif isinstance(value, bytes) or value is np.ma.masked:
pass
else:
arr = np.asarray(value)
if arr.dtype.char == 'U':
arr = np.char.encode(arr, encoding='utf-8')
if isinstance(value, np.ma.MaskedArray):
arr = np.ma.array(arr, mask=value.mask, copy=False)
value = arr
return value
def tolist(self):
if self.dtype.kind == 'S':
return np.chararray.decode(self, encoding='utf-8').tolist()
else:
return super().tolist()
[docs]class Column(BaseColumn):
"""Define a data column for use in a Table object.
Parameters
----------
data : list, ndarray or None
Column data values
name : str
Column name and key for reference within Table
dtype : numpy.dtype compatible value
Data type for column
shape : tuple or ()
Dimensions of a single row element in the column data
length : int or 0
Number of row elements in column data
description : str or None
Full description of column
unit : str or None
Physical unit
format : str or None or function or callable
Format string for outputting column values. This can be an
"old-style" (``format % value``) or "new-style" (`str.format`)
format specification string or a function or any callable object that
accepts a single value and returns a string.
meta : dict-like or None
Meta-data associated with the column
Examples
--------
A Column can be created in two different ways:
- Provide a ``data`` value but not ``shape`` or ``length`` (which are
inferred from the data).
Examples::
col = Column(data=[1, 2], name='name') # shape=(2,)
col = Column(data=[[1, 2], [3, 4]], name='name') # shape=(2, 2)
col = Column(data=[1, 2], name='name', dtype=float)
col = Column(data=np.array([1, 2]), name='name')
col = Column(data=['hello', 'world'], name='name')
The ``dtype`` argument can be any value which is an acceptable
fixed-size data-type initializer for the numpy.dtype() method. See
`<https://docs.scipy.org/doc/numpy/reference/arrays.dtypes.html>`_.
Examples include:
- Python non-string type (float, int, bool)
- Numpy non-string type (e.g. np.float32, np.int64, np.bool\\_)
- Numpy.dtype array-protocol type strings (e.g. 'i4', 'f8', 'S15')
If no ``dtype`` value is provide then the type is inferred using
``np.array(data)``.
- Provide ``length`` and optionally ``shape``, but not ``data``
Examples::
col = Column(name='name', length=5)
col = Column(name='name', dtype=int, length=10, shape=(3,4))
The default ``dtype`` is ``np.float64``. The ``shape`` argument is the
array shape of a single cell in the column.
"""
def __new__(cls, data=None, name=None,
dtype=None, shape=(), length=0,
description=None, unit=None, format=None, meta=None,
copy=False, copy_indices=True):
if isinstance(data, MaskedColumn) and np.any(data.mask):
raise TypeError("Cannot convert a MaskedColumn with masked value to a Column")
self = super().__new__(
cls, data=data, name=name, dtype=dtype, shape=shape, length=length,
description=description, unit=unit, format=format, meta=meta,
copy=copy, copy_indices=copy_indices)
return self
def __setattr__(self, item, value):
if not isinstance(self, MaskedColumn) and item == "mask":
raise AttributeError("cannot set mask value to a column in non-masked Table")
super().__setattr__(item, value)
if item == 'unit' and issubclass(self.dtype.type, np.number):
try:
converted = self.parent_table._convert_col_for_table(self)
except AttributeError: # Either no parent table or parent table is None
pass
else:
if converted is not self:
self.parent_table.replace_column(self.name, converted)
def _base_repr_(self, html=False):
# If scalar then just convert to correct numpy type and use numpy repr
if self.ndim == 0:
return repr(self.item())
descr_vals = [self.__class__.__name__]
unit = None if self.unit is None else str(self.unit)
shape = None if self.ndim <= 1 else self.shape[1:]
for attr, val in (('name', self.name),
('dtype', dtype_info_name(self.dtype)),
('shape', shape),
('unit', unit),
('format', self.format),
('description', self.description),
('length', len(self))):
if val is not None:
descr_vals.append(f'{attr}={val!r}')
descr = '<' + ' '.join(descr_vals) + '>\n'
if html:
from astropy.utils.xml.writer import xml_escape
descr = xml_escape(descr)
data_lines, outs = self._formatter._pformat_col(
self, show_name=False, show_unit=False, show_length=False, html=html)
out = descr + '\n'.join(data_lines)
return out
def _repr_html_(self):
return self._base_repr_(html=True)
def __repr__(self):
return self._base_repr_(html=False)
def __str__(self):
# If scalar then just convert to correct numpy type and use numpy repr
if self.ndim == 0:
return str(self.item())
lines, outs = self._formatter._pformat_col(self)
return '\n'.join(lines)
def __bytes__(self):
return str(self).encode('utf-8')
def _check_string_truncate(self, value):
"""
Emit a warning if any elements of ``value`` will be truncated when
``value`` is assigned to self.
"""
# Convert input ``value`` to the string dtype of this column and
# find the length of the longest string in the array.
value = np.asanyarray(value, dtype=self.dtype.type)
if value.size == 0:
return
value_str_len = np.char.str_len(value).max()
# Parse the array-protocol typestring (e.g. '|U15') of self.dtype which
# has the character repeat count on the right side.
self_str_len = dtype_bytes_or_chars(self.dtype)
if value_str_len > self_str_len:
warnings.warn('truncated right side string(s) longer than {} '
'character(s) during assignment'
.format(self_str_len),
StringTruncateWarning,
stacklevel=3)
def __setitem__(self, index, value):
if self.dtype.char == 'S':
value = self._encode_str(value)
# Issue warning for string assignment that truncates ``value``
if issubclass(self.dtype.type, np.character):
self._check_string_truncate(value)
# update indices
self.info.adjust_indices(index, value, len(self))
# Set items using a view of the underlying data, as it gives an
# order-of-magnitude speed-up. [#2994]
self.data[index] = value
def _make_compare(oper):
"""
Make comparison methods which encode the ``other`` object to utf-8
in the case of a bytestring dtype for Py3+.
"""
swapped_oper = {'__eq__': '__eq__',
'__ne__': '__ne__',
'__gt__': '__lt__',
'__lt__': '__gt__',
'__ge__': '__le__',
'__le__': '__ge__'}[oper]
def _compare(self, other):
op = oper # copy enclosed ref to allow swap below
# Special case to work around #6838. Other combinations work OK,
# see tests.test_column.test_unicode_sandwich_compare(). In this
# case just swap self and other.
#
# This is related to an issue in numpy that was addressed in np 1.13.
# However that fix does not make this problem go away, but maybe
# future numpy versions will do so. NUMPY_LT_1_13 to get the
# attention of future maintainers to check (by deleting or versioning
# the if block below). See #6899 discussion.
# 2019-06-21: still needed with numpy 1.16.
if (isinstance(self, MaskedColumn) and self.dtype.kind == 'U'
and isinstance(other, MaskedColumn) and other.dtype.kind == 'S'):
self, other = other, self
op = swapped_oper
if self.dtype.char == 'S':
other = self._encode_str(other)
# Now just let the regular ndarray.__eq__, etc., take over.
result = getattr(super(), op)(other)
# But we should not return Column instances for this case.
return result.data if isinstance(result, Column) else result
return _compare
__eq__ = _make_compare('__eq__')
__ne__ = _make_compare('__ne__')
__gt__ = _make_compare('__gt__')
__lt__ = _make_compare('__lt__')
__ge__ = _make_compare('__ge__')
__le__ = _make_compare('__le__')
[docs] def insert(self, obj, values, axis=0):
"""
Insert values before the given indices in the column and return
a new `~astropy.table.Column` object.
Parameters
----------
obj : int, slice or sequence of ints
Object that defines the index or indices before which ``values`` is
inserted.
values : array_like
Value(s) to insert. If the type of ``values`` is different from
that of the column, ``values`` is converted to the matching type.
``values`` should be shaped so that it can be broadcast appropriately.
axis : int, optional
Axis along which to insert ``values``. If ``axis`` is None then
the column array is flattened before insertion. Default is 0,
which will insert a row.
Returns
-------
out : `~astropy.table.Column`
A copy of column with ``values`` and ``mask`` inserted. Note that the
insertion does not occur in-place: a new column is returned.
"""
if self.dtype.kind == 'O':
# Even if values is array-like (e.g. [1,2,3]), insert as a single
# object. Numpy.insert instead inserts each element in an array-like
# input individually.
data = np.insert(self, obj, None, axis=axis)
data[obj] = values
else:
self_for_insert = _expand_string_array_for_values(self, values)
data = np.insert(self_for_insert, obj, values, axis=axis)
out = data.view(self.__class__)
out.__array_finalize__(self)
return out
# We do this to make the methods show up in the API docs
name = BaseColumn.name
unit = BaseColumn.unit
copy = BaseColumn.copy
more = BaseColumn.more
pprint = BaseColumn.pprint
pformat = BaseColumn.pformat
convert_unit_to = BaseColumn.convert_unit_to
quantity = BaseColumn.quantity
to = BaseColumn.to
class MaskedColumnInfo(ColumnInfo):
"""
Container for meta information like name, description, format.
This is required when the object is used as a mixin column within a table,
but can be used as a general way to store meta information. In this case
it just adds the ``mask_val`` attribute.
"""
# Add `serialize_method` attribute to the attrs that MaskedColumnInfo knows
# about. This allows customization of the way that MaskedColumn objects
# get written to file depending on format. The default is to use whatever
# the writer would normally do, which in the case of FITS or ECSV is to use
# a NULL value within the data itself. If serialize_method is 'data_mask'
# then the mask is explicitly written out as a separate column if there
# are any masked values. See also code below.
attr_names = ColumnInfo.attr_names | {'serialize_method'}
# When `serialize_method` is 'data_mask', and data and mask are being written
# as separate columns, use column names <name> and <name>.mask (instead
# of default encoding as <name>.data and <name>.mask).
_represent_as_dict_primary_data = 'data'
mask_val = np.ma.masked
def __init__(self, bound=False):
super().__init__(bound)
# If bound to a data object instance then create the dict of attributes
# which stores the info attribute values.
if bound:
# Specify how to serialize this object depending on context.
self.serialize_method = {'fits': 'null_value',
'ecsv': 'null_value',
'hdf5': 'data_mask',
None: 'null_value'}
def _represent_as_dict(self):
out = super()._represent_as_dict()
col = self._parent
# If the serialize method for this context (e.g. 'fits' or 'ecsv') is
# 'data_mask', that means to serialize using an explicit mask column.
method = self.serialize_method[self._serialize_context]
if method == 'data_mask':
# Note: a driver here is a performance issue in #8443 where repr() of a
# np.ma.MaskedArray value is up to 10 times slower than repr of a normal array
# value. So regardless of whether there are masked elements it is useful to
# explicitly define this as a serialized column and use col.data.data (ndarray)
# instead of letting it fall through to the "standard" serialization machinery.
out['data'] = col.data.data
if np.any(col.mask):
# Only if there are actually masked elements do we add the ``mask`` column
out['mask'] = col.mask
elif method == 'null_value':
pass
else:
raise ValueError('serialize method must be either "data_mask" or "null_value"')
return out
[docs]class MaskedColumn(Column, _MaskedColumnGetitemShim, ma.MaskedArray):
"""Define a masked data column for use in a Table object.
Parameters
----------
data : list, ndarray or None
Column data values
name : str
Column name and key for reference within Table
mask : list, ndarray or None
Boolean mask for which True indicates missing or invalid data
fill_value : float, int, str or None
Value used when filling masked column elements
dtype : numpy.dtype compatible value
Data type for column
shape : tuple or ()
Dimensions of a single row element in the column data
length : int or 0
Number of row elements in column data
description : str or None
Full description of column
unit : str or None
Physical unit
format : str or None or function or callable
Format string for outputting column values. This can be an
"old-style" (``format % value``) or "new-style" (`str.format`)
format specification string or a function or any callable object that
accepts a single value and returns a string.
meta : dict-like or None
Meta-data associated with the column
Examples
--------
A MaskedColumn is similar to a Column except that it includes ``mask`` and
``fill_value`` attributes. It can be created in two different ways:
- Provide a ``data`` value but not ``shape`` or ``length`` (which are
inferred from the data).
Examples::
col = MaskedColumn(data=[1, 2], name='name')
col = MaskedColumn(data=[1, 2], name='name', mask=[True, False])
col = MaskedColumn(data=[1, 2], name='name', dtype=float, fill_value=99)
The ``mask`` argument will be cast as a boolean array and specifies
which elements are considered to be missing or invalid.
The ``dtype`` argument can be any value which is an acceptable
fixed-size data-type initializer for the numpy.dtype() method. See
`<https://docs.scipy.org/doc/numpy/reference/arrays.dtypes.html>`_.
Examples include:
- Python non-string type (float, int, bool)
- Numpy non-string type (e.g. np.float32, np.int64, np.bool\\_)
- Numpy.dtype array-protocol type strings (e.g. 'i4', 'f8', 'S15')
If no ``dtype`` value is provide then the type is inferred using
``np.array(data)``. When ``data`` is provided then the ``shape``
and ``length`` arguments are ignored.
- Provide ``length`` and optionally ``shape``, but not ``data``
Examples::
col = MaskedColumn(name='name', length=5)
col = MaskedColumn(name='name', dtype=int, length=10, shape=(3,4))
The default ``dtype`` is ``np.float64``. The ``shape`` argument is the
array shape of a single cell in the column.
"""
info = MaskedColumnInfo()
def __new__(cls, data=None, name=None, mask=None, fill_value=None,
dtype=None, shape=(), length=0,
description=None, unit=None, format=None, meta=None,
copy=False, copy_indices=True):
if mask is None:
# If mask is None then we need to determine the mask (if any) from the data.
# The naive method is looking for a mask attribute on data, but this can fail,
# see #8816. Instead use ``MaskedArray`` to do the work.
mask = ma.MaskedArray(data).mask
if mask is np.ma.nomask:
# Handle odd-ball issue with np.ma.nomask (numpy #13758), and see below.
mask = False
elif copy:
mask = mask.copy()
elif mask is np.ma.nomask:
# Force the creation of a full mask array as nomask is tricky to
# use and will fail in an unexpected manner when setting a value
# to the mask.
mask = False
else:
mask = deepcopy(mask)
# Create self using MaskedArray as a wrapper class, following the example of
# class MSubArray in
# https://github.com/numpy/numpy/blob/maintenance/1.8.x/numpy/ma/tests/test_subclassing.py
# This pattern makes it so that __array_finalize__ is called as expected (e.g. #1471 and
# https://github.com/astropy/astropy/commit/ff6039e8)
# First just pass through all args and kwargs to BaseColumn, then wrap that object
# with MaskedArray.
self_data = BaseColumn(data, dtype=dtype, shape=shape, length=length, name=name,
unit=unit, format=format, description=description,
meta=meta, copy=copy, copy_indices=copy_indices)
self = ma.MaskedArray.__new__(cls, data=self_data, mask=mask)
# Note: do not set fill_value in the MaskedArray constructor because this does not
# go through the fill_value workarounds.
if fill_value is None and getattr(data, 'fill_value', None) is not None:
# Coerce the fill_value to the correct type since `data` may be a
# different dtype than self.
fill_value = np.array(data.fill_value, self.dtype)[()]
self.fill_value = fill_value
self.parent_table = None
# needs to be done here since self doesn't come from BaseColumn.__new__
for index in self.indices:
index.replace_col(self_data, self)
return self
@property
def fill_value(self):
return self.get_fill_value() # defer to native ma.MaskedArray method
@fill_value.setter
def fill_value(self, val):
"""Set fill value both in the masked column view and in the parent table
if it exists. Setting one or the other alone doesn't work."""
# another ma bug workaround: If the value of fill_value for a string array is
# requested but not yet set then it gets created as 'N/A'. From this point onward
# any new fill_values are truncated to 3 characters. Note that this does not
# occur if the masked array is a structured array (as in the previous block that
# deals with the parent table).
#
# >>> x = ma.array(['xxxx'])
# >>> x.fill_value # fill_value now gets represented as an 'S3' array
# 'N/A'
# >>> x.fill_value='yyyy'
# >>> x.fill_value
# 'yyy'
#
# To handle this we are forced to reset a private variable first:
self._fill_value = None
self.set_fill_value(val) # defer to native ma.MaskedArray method
@property
def data(self):
"""The plain MaskedArray data held by this column."""
out = self.view(np.ma.MaskedArray)
# By default, a MaskedArray view will set the _baseclass to be the
# same as that of our own class, i.e., BaseColumn. Since we want
# to return a plain MaskedArray, we reset the baseclass accordingly.
out._baseclass = np.ndarray
return out
[docs] def filled(self, fill_value=None):
"""Return a copy of self, with masked values filled with a given value.
Parameters
----------
fill_value : scalar; optional
The value to use for invalid entries (`None` by default). If
`None`, the ``fill_value`` attribute of the array is used
instead.
Returns
-------
filled_column : Column
A copy of ``self`` with masked entries replaced by `fill_value`
(be it the function argument or the attribute of ``self``).
"""
if fill_value is None:
fill_value = self.fill_value
data = super().filled(fill_value)
# Use parent table definition of Column if available
column_cls = self.parent_table.Column if (self.parent_table is not None) else Column
out = column_cls(name=self.name, data=data, unit=self.unit,
format=self.format, description=self.description,
meta=deepcopy(self.meta))
return out
[docs] def insert(self, obj, values, mask=None, axis=0):
"""
Insert values along the given axis before the given indices and return
a new `~astropy.table.MaskedColumn` object.
Parameters
----------
obj : int, slice or sequence of ints
Object that defines the index or indices before which ``values`` is
inserted.
values : array_like
Value(s) to insert. If the type of ``values`` is different from
that of the column, ``values`` is converted to the matching type.
``values`` should be shaped so that it can be broadcast appropriately.
mask : bool or array_like
Mask value(s) to insert. If not supplied, and values does not have
a mask either, then False is used.
axis : int, optional
Axis along which to insert ``values``. If ``axis`` is None then
the column array is flattened before insertion. Default is 0,
which will insert a row.
Returns
-------
out : `~astropy.table.MaskedColumn`
A copy of column with ``values`` and ``mask`` inserted. Note that the
insertion does not occur in-place: a new masked column is returned.
"""
self_ma = self.data # self viewed as MaskedArray
if self.dtype.kind == 'O':
# Even if values is array-like (e.g. [1,2,3]), insert as a single
# object. Numpy.insert instead inserts each element in an array-like
# input individually.
new_data = np.insert(self_ma.data, obj, None, axis=axis)
new_data[obj] = values
else:
self_ma = _expand_string_array_for_values(self_ma, values)
new_data = np.insert(self_ma.data, obj, values, axis=axis)
if mask is None:
mask = getattr(values, 'mask', np.ma.nomask)
if mask is np.ma.nomask:
if self.dtype.kind == 'O':
mask = False
else:
mask = np.zeros(np.shape(values), dtype=bool)
new_mask = np.insert(self_ma.mask, obj, mask, axis=axis)
new_ma = np.ma.array(new_data, mask=new_mask, copy=False)
out = new_ma.view(self.__class__)
out.parent_table = None
out.indices = []
out._copy_attrs(self)
out.fill_value = self.fill_value
return out
def _copy_attrs_slice(self, out):
# Fixes issue #3023: when calling getitem with a MaskedArray subclass
# the original object attributes are not copied.
if out.__class__ is self.__class__:
out.parent_table = None
# we need this because __getitem__ does a shallow copy of indices
if out.indices is self.indices:
out.indices = []
out._copy_attrs(self)
return out
def __setitem__(self, index, value):
# Issue warning for string assignment that truncates ``value``
if self.dtype.char == 'S':
value = self._encode_str(value)
if issubclass(self.dtype.type, np.character):
# Account for a bug in np.ma.MaskedArray setitem.
# https://github.com/numpy/numpy/issues/8624
value = np.ma.asanyarray(value, dtype=self.dtype.type)
# Check for string truncation after filling masked items with
# empty (zero-length) string. Note that filled() does not make
# a copy if there are no masked items.
self._check_string_truncate(value.filled(''))
# update indices
self.info.adjust_indices(index, value, len(self))
ma.MaskedArray.__setitem__(self, index, value)
# We do this to make the methods show up in the API docs
name = BaseColumn.name
copy = BaseColumn.copy
more = BaseColumn.more
pprint = BaseColumn.pprint
pformat = BaseColumn.pformat
convert_unit_to = BaseColumn.convert_unit_to