I am trying to get the top-most rating using a groupby of multiple columns, and if a particular combination does not exist in that groupby, it throws an error. How can I do a multiple-column groupby that handles this case?
You can use *args for dynamic input (the order of the values cannot be changed), together with query for filtering:
def get_top(*args):
    c = ['maritalstatus', 'gender', 'age_range', 'occ']
    m = (df.groupby(c)['rating']
           .apply(lambda x: x.value_counts().index[0])
           .reset_index())
    args = list(args)
    while True:
        d = dict(zip(c, args))
        # https://stackoverflow.com/a/48371587/2901002
        q = ' & '.join('({} == "{}")'.format(i, j) for i, j in d.items())
        m1 = m.query(q)['rating']
        if m1.empty and len(args) > 1:
            args.pop()
        else:
            return m1

print(get_top('ma', 'M', 'young', 'teacher'))
1 PG
Name: rating, dtype: object
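The question did not include sample data, so here is a minimal sketch of a hypothetical df consistent with the output above; the column names follow the question, and the values are illustrative only:

import pandas as pd

# Hypothetical sample data (an assumption), mirroring the small test file used further below
df = pd.DataFrame({
    'maritalstatus': ['ma', 'ma', 'sin', 'sin', 'ma', 'sin'],
    'gender': ['M', 'F', 'M', 'M', 'M', 'F'],
    'age_range': ['young', 'adult', 'young', 'adult', 'young', 'adult'],
    'occ': ['student', 'teacher', 'student', 'teacher', 'student', 'teacher'],
    'rating': ['PG', 'R', 'PG', 'R', 'PG', 'R'],
})

print(get_top('ma', 'M', 'young', 'teacher'))
# 1    PG
# Name: rating, dtype: object

Because no group matches all four values, the while loop pops the last argument and returns the top rating for the remaining ('ma', 'M', 'young') combination.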
pandas is definitely the go-to library for handling detailed tabular data. For those seeking a non-pandas option, you can build your own mapping and reduction functions; these terms are used here for concepts analogous to pandas' groupby/aggregation.
Given

Cleaned data where multiple spaces have been replaced with a single delimiter, e.g. ",".
%%file "test.txt"
status,gender,age_range,occ,rating
ma,M,young,student,PG
ma,F,adult,teacher,R
sin,M,young,student,PG
sin,M,adult,teacher,R
ma,M,young,student,PG
sin,F,adult,teacher,R
Code
import csv
import collections as ct
Step 1: Read data
def read_file(fname):
    with open(fname, "r") as f:
        reader = csv.DictReader(f)
        for line in reader:
            yield line

iterable = [line for line in read_file("test.txt")]
iterable
Output
[OrderedDict([('status', 'ma'),
('gender', 'M'),
('age_range', 'young'),
('occ', 'student'),
('rating', 'PG')]),
OrderedDict([('status', 'ma'),
('gender', 'F'),
('age_range', 'adult'),
...]
...
]
Step 2: Remap data
def mapping(data, column):
    """Return a dict of regrouped data."""
    dd = ct.defaultdict(list)
    for d in data:
        key = d[column]
        value = {k: v for k, v in d.items() if k != column}
        dd[key].append(value)
    return dict(dd)

mapping(iterable, "gender")
Output
{'M': [
{'age_range': 'young', 'occ': 'student', 'rating': 'PG', ...},
...]
'F': [
{'status': 'ma', 'age_range': 'adult', ...},
...]
}
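The same mapping works for any column; regrouping by occupation, for example, keys the rows by "student" and "teacher":

list(mapping(iterable, "occ"))
# ['student', 'teacher']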
Step 3: Reduce data
def reduction(data):
    """Return a reduced mapping of Counters."""
    final = {}
    for key, val in data.items():
        agg = ct.defaultdict(ct.Counter)
        for d in val:
            for k, v in d.items():
                agg[k][v] += 1
        final[key] = dict(agg)
    return final

reduction(mapping(iterable, "gender"))
Output
{'F': {
'age_range': Counter({'adult': 2}),
'occ': Counter({'teacher': 2}),
'rating': Counter({'R': 2}),
'status': Counter({'ma': 1, 'sin': 1})},
'M': {
'age_range': Counter({'adult': 1, 'young': 3}),
'occ': Counter({'student': 3, 'teacher': 1}),
'rating': Counter({'PG': 3, 'R': 1}),
'status': Counter({'ma': 2, 'sin': 2})}
}
Demo
With these tools in place, you can build a data pipeline to query the data, feeding the results of one function into another:

# Find the top age range among males
pipeline = reduction(mapping(iterable, "gender"))
pipeline["M"]["age_range"].most_common(1)
# [('young', 3)]
# Find the top ratings among teachers
pipeline = reduction(mapping(iterable, "occ"))
pipeline["teacher"]["rating"].most_common()
# [('R', 3)]
# Find the number of married people
pipeline = reduction(mapping(iterable, "gender"))
sum(v["status"]["ma"] for k, v in pipeline.items())
# 3
Overall, you tailor your output based on how you define your reduction function.
Note, the code for this generalized process is more verbose than the pandas example above, despite its applicability to any number of data columns. pandas succinctly encapsulates these concepts; although its learning curve may initially be steeper, it can greatly expedite data analysis.
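For comparison, a minimal pandas sketch of the same reductions, assuming the same test.txt file as above:

import pandas as pd

df = pd.read_csv("test.txt")

# Tally age ranges per gender, analogous to pipeline["M"]["age_range"]
df.groupby("gender")["age_range"].value_counts()

# Top rating among teachers, analogous to pipeline["teacher"]["rating"].most_common()
df.loc[df["occ"] == "teacher", "rating"].value_counts().head(1)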
Details

The mapping step regroups the rows under each unique value of the chosen column, e.g. "M", "F". In the reduction step, the defaultdict initializes a Counter for each remaining column, and repeated entries simply tally observations.

Application
Pipelines are optional. Here we will build a single function that processes serial requests:
def serial_reduction(iterable, val_queries):
    """Return a `Counter` that is reduced after serial queries."""
    q1, *qs = val_queries
    val_to_key = {v: k for k, v in iterable[0].items()}
    values_list = mapping(iterable, val_to_key[q1])[q1]
    counter = ct.Counter()
    # Filter the rows by each query value in turn
    for q in qs:
        try:
            values_list = [row for row in values_list
                           if row[val_to_key[q]] == q]
        except KeyError:
            raise ValueError("'{}' not found. Try a new query.".format(q))
    # Tally the remaining (non-query) values across the surviving rows
    for row in values_list:
        for val in {v for v in row.values() if v not in qs}:
            counter[val] += 1
    return counter
c = serial_reduction(iterable, "ma M young".split())
c.most_common()
# [('student', 2), ('PG', 2)]
serial_reduction(iterable, "ma M young teacher".split())
# ValueError: 'teacher' not found. Try a new query.
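If you want the same fall-back behaviour as the pandas *args answer above (dropping the last query when the combination cannot be found), one possible sketch uses a hypothetical get_top_serial wrapper around serial_reduction:

def get_top_serial(iterable, val_queries):
    """Hypothetical helper: drop trailing queries until something matches."""
    queries = list(val_queries)
    while True:
        try:
            result = serial_reduction(iterable, queries)
        except ValueError:
            result = ct.Counter()
        if result or len(queries) <= 1:
            return result.most_common()
        queries.pop()

get_top_serial(iterable, "ma M young teacher".split())
# [('student', 2), ('PG', 2)]  (order of tied counts may vary)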
Here is another option that combines collections.Counter with DataFrame.query; Counter.most_common() orders the results by descending counts.
from collections import Counter

def get_top(maritalstatus=None, gender=None, age_range=None, occ=None):
    cols = ['maritalstatus', 'gender', 'age_range', 'occ']
    values = [maritalstatus, gender, age_range, occ]
    c = Counter(df.query(' & '.join('({0} == "{1}")'.format(i, j)
                                    for i, j in zip(cols, values) if j))['rating'])
    return c.most_common()

get_top(maritalstatus='ma', gender='M', age_range='young')  # [('PG', 2)]
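Note that with this approach a combination that does not exist does not raise an error; the query simply matches no rows and most_common() returns an empty list, so you can retry with fewer keyword arguments. Using the hypothetical sample df sketched earlier:

get_top(maritalstatus='ma', gender='M', age_range='young', occ='teacher')
# []  (no rows match, so the Counter is empty)

get_top(maritalstatus='ma', gender='M')
# [('PG', 2)]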