I am trying to get the top rating using a groupby over multiple columns, but if a particular combination of values does not exist in that groupby, it throws an error. How to do multiple c…
You can use *args for dynamic input (the order of the values cannot be changed), together with query for filtering:
def get_top(*args):
    """Return the top rating for the most specific matching combination.

    ``args`` are values for the grouping columns, given positionally in the
    fixed order ``maritalstatus, gender, age_range, occ``. If the supplied
    combination matches no group, the last value is dropped and the lookup
    is retried, until something matches or only one value is left.

    Reads the module-level DataFrame ``df``.

    Returns a Series holding the most frequent rating of every matching
    group (several entries when fewer than four values are in effect). It
    is empty only if the first value on its own matches nothing.
    """
    c = ['maritalstatus', 'gender', 'age_range', 'occ']
    # Most frequent rating per full group. reset_index() turns the group keys
    # back into ordinary columns so that query() can filter on them by name.
    m = (df.groupby(c)['rating']
           .apply(lambda x: x.value_counts().index[0])
           .reset_index())
    args = list(args)
    while True:
        d = dict(zip(c, args))
        q = ' & '.join('({} == "{}")'.format(i, j) for i, j in d.items())
        m1 = m.query(q)['rating']
        if m1.empty and len(args) > 1:
            # No such combination: relax the right-most criterion and retry.
            args.pop()
        else:
            return m1
# Example call; the values follow the column order
# maritalstatus, gender, age_range, occ.
print(get_top('ma', 'M', 'young','teacher'))
1 PG
Name: rating, dtype: object
pandas is definitely the go-to library for handling detailed tabular data. For those seeking a non-pandas
option, you can build your own mapping and reduction functions. I use these terms to mean the following:
analogous groupby/aggregation concepts.
Cleaned data where multiple spaces have been replaced with a single delimiter, e.g. ","
%%file "test.txt"
import csv
import collections as ct
Step 1: Read data
def read_file(fname):
    """Yield each row of the CSV file ``fname`` as a dict keyed by header name.

    The first line of the file supplies the field names (the
    ``csv.DictReader`` default). Rows are produced lazily, so the file stays
    open until the generator is exhausted or closed.
    """
    # newline="" hands line-ending handling to the csv module, as its
    # documentation requires; otherwise newlines embedded in quoted fields
    # are silently rewritten by universal-newline translation.
    with open(fname, "r", newline="") as f:
        yield from csv.DictReader(f)
# Materialize all rows up front so the data can be traversed repeatedly.
iterable = list(read_file("test.txt"))
[OrderedDict([('status', 'ma'),
('gender', 'M'),
('age_range', 'young'),
('occ', 'student'),
('rating', 'PG')]),
OrderedDict([('status', 'ma'),
('gender', 'F'),
('age_range', 'adult'),
Step 2: Remap data
def mapping(data, column):
    """Return a dict of regrouped data.

    Groups the row dicts in ``data`` by their value in ``column``. Each key
    of the result is one distinct value of that column and maps to a list of
    the matching rows, in input order, with ``column`` itself removed.

    Raises ``KeyError`` if a row has no ``column`` key.
    """
    dd = ct.defaultdict(list)
    for d in data:
        key = d[column]
        value = {k: v for k, v in d.items() if k != column}
        # File the stripped row under its group; without this step the
        # result would always be empty.
        dd[key].append(value)
    return dict(dd)
# Example: regroup the rows by their "gender" value.
mapping(iterable, "gender")
{'M': [
{'age_range': 'young', 'occ': 'student', 'rating': 'PG', ...},
'F': [
{'status': 'ma', 'age_range': 'adult', ...},
Step 3: Reduce data
def reduction(data):
    """Return a reduced mapping of Counters.

    ``data`` maps a group key to a list of row dicts. In the result, each
    group key maps to a plain dict with one ``Counter`` per column, tallying
    how often each value occurs in that column within the group.
    """
    summary = {}
    for group, rows in data.items():
        tallies = ct.defaultdict(ct.Counter)
        for row in rows:
            for column, item in row.items():
                # Each occurrence of a value adds one to its column's tally.
                tallies[column].update([item])
        summary[group] = dict(tallies)
    return summary
# Example: group the rows by "gender", then tally every remaining column.
reduction(mapping(iterable, "gender"))
{'F': {
'age_range': Counter({'adult': 2}),
'occ': Counter({'teacher': 2}),
'rating': Counter({'R': 2}),
'status': Counter({'ma': 1, 'sin': 1})},
'M': {
'age_range': Counter({'adult': 1, 'young': 3}),
'occ': Counter({'student': 3, 'teacher': 1}),
'rating': Counter({'PG': 3, 'R': 1}),
'status': Counter({'ma': 2, 'sin': 2})}
With these tools in place, you can build a data pipeline to query the data, feeding results from one function into another:
# Find the top age range among males
pipeline = reduction(mapping(iterable, "gender"))
pipeline["M"]["age_range"].most_common(1)
# [('young', 3)]

# Find the top ratings among teachers
pipeline = reduction(mapping(iterable, "occ"))
pipeline["teacher"]["rating"].most_common(1)
# [('R', 3)]

# Find the number of married people
pipeline = reduction(mapping(iterable, "gender"))
sum(v["status"]["ma"] for v in pipeline.values())
# 3
Overall, you tailor your output based on how you define your reduction function.
Note, the code from this generalized process is more verbose than a former example despite its powerful application to many data columns. pandas
succinctly encapsulates these concepts. Although the learning curve may initially be more steep, it can greatly expedite data analysis.
, "F"
. defaultdict
initialize a Counter
and repeated entries simply tally observations.

Application
Pipelines are optional. Here we will build a single function that processes serial requests:
def serial_reduction(iterable, val_queries):
    """Return a `Counter` that is reduced after serial queries.

    ``iterable`` is a non-empty list of row dicts and ``val_queries`` a
    non-empty sequence of column *values*. The column each value belongs to
    is inferred from the first row, so only values that occur in that row
    can be queried.

    Rows are narrowed by the first query via ``mapping`` and then by every
    following query in turn. The returned Counter tallies, once per
    surviving row, each remaining value that was not itself a query.

    Raises ``ValueError`` if a query after the first does not occur in the
    first row. An unknown first query raises ``KeyError``.
    """
    q1, *qs = val_queries
    # Reverse lookup value -> column name, taken from the first row only.
    val_to_key = {v: k for k, v in iterable[0].items()}
    values_list = mapping(iterable, val_to_key[q1])[q1]

    # Narrow the candidate rows one query at a time.
    for q in qs:
        try:
            column = val_to_key[q]
        except KeyError as err:
            raise ValueError(
                "'{}' not found. Try a new query.".format(q)
            ) from err
        values_list = [row for row in values_list if row.get(column) == q]

    # Tally what is left in each surviving row, ignoring the queried values.
    counter = ct.Counter()
    for row in values_list:
        reduced_vals = {v for v in row.values() if v not in qs}
        for val in reduced_vals:
            counter[val] += 1
    return counter
c = serial_reduction(iterable, "ma M young".split())
c.most_common()
# [('student', 2), ('PG', 2)]

serial_reduction(iterable, "ma M young teacher".split())
# ValueError: 'teacher' not found. Try a new query.
This is one non-pandas solution. Counter.most_common()
orders results by most common descending counts.
from collections import Counter
def get_top(maritalstatus=None, gender=None, age_range=None, occ=None):
    """Return ``(rating, count)`` pairs for the matching rows, most common first.

    Every argument that is supplied (truthy) restricts the module-level
    DataFrame ``df`` to rows whose column of the same name equals that
    value; omitted arguments place no restriction. With no arguments at all,
    the ratings of every row are counted.

    Returns the list produced by ``Counter.most_common()``; it is empty when
    no row matches.
    """
    cols = ['maritalstatus', 'gender', 'age_range', 'occ']
    values = [maritalstatus, gender, age_range, occ]
    conditions = ['({0} == "{1}")'.format(i, j)
                  for i, j in zip(cols, values) if j]
    # DataFrame.query() rejects an empty expression, so skip the filtering
    # step entirely when no criteria were given.
    matches = df.query(' & '.join(conditions)) if conditions else df
    return Counter(matches['rating']).most_common()
# Example: filter on three of the four columns; occ keeps its default of None.
get_top(maritalstatus='ma', gender='M', age_range='young') # [('PG', 2)]