According to this post, I should be able to access the names of columns in an ndarray as a.dtype.names
However, if I convert a pandas DataFrame to an ndarray with df.a
OK, here is where I'm leaning:
class NDArrayWithColumns(np.ndarray):
    """An ``ndarray`` subclass that carries column labels in ``.columns``.

    The array data and dtype are those of the wrapped array; the labels are
    stored as a plain Python attribute alongside it.

    NOTE: ``columns`` is copied unchanged onto every view or slice derived
    from an instance (see ``__array_finalize__``); it is not adjusted when
    columns are sliced away or reduced.
    """

    def __new__(cls, obj, columns=None):
        """Wrap *obj* as an instance of this class and attach *columns*.

        Args:
            obj: an ndarray, or anything ``np.asarray`` accepts.
            columns: the column labels to attach, or ``None``.
        """
        # np.asarray returns ndarrays untouched but also lets plain
        # sequences through, which have no ``.view`` method.
        wrapped = np.asarray(obj).view(cls)
        wrapped.columns = columns
        return wrapped

    def __array_finalize__(self, obj):
        """Propagate ``columns`` from the array this one was derived from."""
        # ``obj`` is None on explicit construction; getattr then yields None,
        # so ``columns`` is defined on every instance.
        self.columns = getattr(obj, 'columns', None)

    @classmethod
    def from_dataframe(cls, df):
        """Build an instance from a DataFrame, keeping its column labels."""
        cols = tuple(df.columns)
        # ``df.values`` replaces ``df.as_matrix(cols)``, which newer pandas
        # releases no longer provide; all columns are taken in order.
        return cls.from_array(df.values, cols)

    @classmethod
    def from_array(cls, array, columns):
        """Return *array* wrapped with *columns*.

        An array that is already an ``NDArrayWithColumns`` is returned as-is
        and *columns* is ignored.
        """
        if isinstance(array, NDArrayWithColumns):
            return array
        labels = None if columns is None else tuple(columns)
        return cls(array, labels)

    def __str__(self):
        """Return the ndarray text, preceded by a ``# col, col`` header line."""
        body = np.ndarray.__str__(self)
        labels = self.columns
        # Explicit None/length test: truthiness is ambiguous for array-like
        # label containers.
        if labels is None or len(labels) == 0:
            return body
        # str() each label so non-string labels can be joined too.
        header = "# " + ", ".join(str(label) for label in labels) + "\n"
        return header + body
# Demo: wrap a small DataFrame (one NaN entry) and show the array text,
# the attached column labels and the dtype.
NAN = float("nan")
X = pd.DataFrame(dict(age=[40., NAN, 60.], sys_blood_pressure=[140., 150., 160.]))
arr = NDArrayWithColumns.from_dataframe(X)
# Single-argument print(...) behaves the same under Python 2 and Python 3.
print(arr)
print(arr.columns)
print(arr.dtype)
# age, sys_blood_pressure
[[ 40. 140.]
[ nan 150.]
[ 60. 160.]]
('age', 'sys_blood_pressure')
and can also be passed to a typed Cython function expecting an ndarray[2,double_t].
UPDATE: this works pretty well except for some oddness when passing the type to ufuncs.