In [12]: time_zones = [rec['tz'] for rec in records]
---------------------------------------------------------------------------
KeyError                                  Traceback (most recent call last)
<ipython-input-12-db4fbd348da9> in <module>()
----> 1 time_zones = [rec['tz'] for rec in records]
<ipython-input-12-db4fbd348da9> in <listcomp>(.0)
----> 1 time_zones = [rec['tz'] for rec in records]
KeyError: 'tz'
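Not every record has a 'tz' field, which is what raises the KeyError; a minimal fix is to filter out the records that lack the key:

# Keep only the records that actually contain a 'tz' key
time_zones = [rec['tz'] for rec in records if 'tz' in rec]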
# Rearrange the data for plotting
In [58]: count_subset = count_subset.stack()
In [59]: count_subset.name = 'total'
In [60]: count_subset = count_subset.reset_index()
In [61]: count_subset[:10]
Out[61]:
                  tz           os  total
0  America/Sao_Paulo  Not Windows   13.0
1  America/Sao_Paulo      Windows   20.0
2      Europe/Madrid  Not Windows   16.0
3      Europe/Madrid      Windows   19.0
4   Pacific/Honolulu  Not Windows    0.0
5   Pacific/Honolulu      Windows   36.0
6         Asia/Tokyo  Not Windows    2.0
7         Asia/Tokyo      Windows   35.0
8      Europe/London  Not Windows   43.0
9      Europe/London      Windows   31.0
In [62]: sns.barplot(x='total', y='tz', hue='os', data=count_subset)
This plot doesn't make it easy to see the relative percentage of Windows users within the smaller groups, so let's normalize the group percentages to sum to 1:
def norm_total(group):
    group['normed_total'] = group.total / group.total.sum()
    return group
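The function is then applied per time zone, and the normalized column can be plotted with the same seaborn call as above; a sketch of these two steps:

# Normalize within each time zone, then plot the normalized OS share
results = count_subset.groupby('tz').apply(norm_total)
sns.barplot(x='normed_total', y='tz', hue='os', data=results)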
In [71]: movies[:5]
Out[71]:
   movie_id                               title                        genres
0         1                    Toy Story (1995)   Animation|Children's|Comedy
1         2                      Jumanji (1995)  Adventure|Children's|Fantasy
2         3             Grumpier Old Men (1995)                Comedy|Romance
3         4            Waiting to Exhale (1995)                  Comedy|Drama
4         5  Father of the Bride Part II (1995)                        Comedy
In [73]: data = pd.merge(pd.merge(ratings, users), movies)
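pd.merge infers the join keys from the overlapping column names (user_id between ratings and users, then movie_id with movies); spelling the keys out, as a sketch, is equivalent:

# Equivalent merge with explicit join keys
data = pd.merge(pd.merge(ratings, users, on='user_id'), movies, on='movie_id')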
In [74]: data
Out[74]:
         user_id  movie_id  rating  timestamp gender  age  occupation    zip  \
0              1      1193       5  978300760      F    1          10  48067
1              2      1193       5  978298413      M   56          16  70072
2             12      1193       4  978220179      M   25          12  32793
3             15      1193       4  978199279      M   25           7  22903
4             17      1193       5  978158471      M   50           1  95350
...          ...       ...     ...        ...    ...  ...         ...    ...
1000204     5949      2198       5  958846401      M   18          17  47901
1000205     5675      2703       3  976029116      M   35          14  30030
1000206     5780      2845       1  958153068      M   18          17  92886
1000207     5851      3607       5  957756608      F   18          20  55410
1000208     5938      2909       4  957273353      M   25           1  35401

                                               title                genres
0            One Flew Over the Cuckoo's Nest (1975)                 Drama
1            One Flew Over the Cuckoo's Nest (1975)                 Drama
2            One Flew Over the Cuckoo's Nest (1975)                 Drama
3            One Flew Over the Cuckoo's Nest (1975)                 Drama
4            One Flew Over the Cuckoo's Nest (1975)                 Drama
...                                              ...                   ...
1000204                           Modulations (1998)           Documentary
1000205                        Broken Vessels (1998)                 Drama
1000206                            White Boys (1999)                 Drama
1000207                     One Little Indian (1973)  Comedy|Drama|Western
1000208  Five Wives, Three Secretaries and Me (1998)           Documentary

[1000209 rows x 10 columns]

In [75]: data.iloc[0]
Out[75]:
user_id                                            1
movie_id                                        1193
rating                                             5
timestamp                                  978300760
gender                                             F
age                                                1
occupation                                        10
zip                                            48067
title         One Flew Over the Cuckoo's Nest (1975)
genres                                         Drama
Name: 0, dtype: object
To get mean movie ratings for each film grouped by gender, we can use the pivot_table method:
In [76]: mean_ratings = data.pivot_table('rating', index='title',
   ....:                                 columns='gender', aggfunc='mean')
In [77]: mean_ratings[:5]
Out[77]:
gender                                F         M
title
$1,000,000 Duck (1971)         3.375000  2.761905
'Night Mother (1986)           3.388889  3.352941
'Til There Was You (1997)      2.675676  2.733333
'burbs, The (1989)             2.793478  2.962085
...And Justice for All (1979)  3.828571  3.689024
In [78]: ratings_by_title = data.groupby('title').size()
In [79]: ratings_by_title[:10]
Out[79]:
title
$1,000,000 Duck (1971)                37
'Night Mother (1986)                  70
'Til There Was You (1997)             52
'burbs, The (1989)                   303
...And Justice for All (1979)        199
1-900 (1994)                           2
10 Things I Hate About You (1999)    700
101 Dalmatians (1961)                565
101 Dalmatians (1996)                364
12 Angry Men (1957)                  616
dtype: int64

In [80]: active_titles = ratings_by_title.index[ratings_by_title >= 250]

In [81]: active_titles
Out[81]:
Index(["'burbs, The (1989)", '10 Things I Hate About You (1999)',
       '101 Dalmatians (1961)', '101 Dalmatians (1996)',
       '12 Angry Men (1957)', '13th Warrior, The (1999)',
       '2 Days in the Valley (1996)', '20,000 Leagues Under the Sea (1954)',
       '2001: A Space Odyssey (1968)', '2010 (1984)',
       ...
       'X-Men (2000)', 'Year of Living Dangerously (1982)',
       'Yellow Submarine (1968)', "You've Got Mail (1998)",
       'Young Frankenstein (1974)', 'Young Guns (1988)',
       'Young Guns II (1990)', 'Young Sherlock Holmes (1985)',
       'Zero Effect (1998)', 'eXistenZ (1999)'],
      dtype='object', name='title', length=1216)
# Select rows on the index
In [82]: mean_ratings = mean_ratings.loc[active_titles]
In [83]: mean_ratings
Out[83]:
gender                                    F         M
title
'burbs, The (1989)                 2.793478  2.962085
10 Things I Hate About You (1999)  3.646552  3.311966
101 Dalmatians (1961)              3.791444  3.500000
101 Dalmatians (1996)              3.240000  2.911215
12 Angry Men (1957)                4.184397  4.328421
...                                     ...       ...
Young Guns (1988)                  3.371795  3.425620
Young Guns II (1990)               2.934783  2.904025
Young Sherlock Holmes (1985)       3.514706  3.363344
Zero Effect (1998)                 3.864407  3.723140
eXistenZ (1999)                    3.098592  3.289086

[1216 rows x 2 columns]
To see the top films among female viewers, we can sort by the F column in descending order:
In [85]: top_female_ratings = mean_ratings.sort_values(by='F', ascending=False)
In [86]: top_female_ratings[:10]
Out[86]:
gender                                                 F         M
title
Close Shave, A (1995)                           4.644444  4.473795
Wrong Trousers, The (1993)                      4.588235  4.478261
Sunset Blvd. (a.k.a. Sunset Boulevard) (1950)   4.572650  4.464589
Wallace & Gromit: The Best of Aardman Animat... 4.563107  4.385075
Schindler's List (1993)                         4.562602  4.491415
Shawshank Redemption, The (1994)                4.539075  4.560625
Grand Day Out, A (1992)                         4.537879  4.293255
To Kill a Mockingbird (1962)                    4.536667  4.372611
Creature Comforts (1990)                        4.513889  4.272277
Usual Suspects, The (1995)                      4.513317  4.518248
In [87]: mean_ratings['diff'] = mean_ratings['M'] - mean_ratings['F']
Sorting by 'diff' gives us the movies with the greatest rating divergence that were preferred by women:
In [88]: sorted_by_diff = mean_ratings.sort_values(by='diff')
In [89]: sorted_by_diff[:10]
Out[89]:
gender                                        F         M      diff
title
Dirty Dancing (1987)                   3.790378  2.959596 -0.830782
Jumpin' Jack Flash (1986)              3.254717  2.578358 -0.676359
Grease (1978)                          3.975265  3.367041 -0.608224
Little Women (1994)                    3.870588  3.321739 -0.548849
Steel Magnolias (1989)                 3.901734  3.365957 -0.535777
Anastasia (1997)                       3.800000  3.281609 -0.518391
Rocky Horror Picture Show, The (1975)  3.673016  3.160131 -0.512885
Color Purple, The (1985)               4.158192  3.659341 -0.498851
Age of Innocence, The (1993)           3.827068  3.339506 -0.487561
Free Willy (1993)                      2.921348  2.438776 -0.482573
Reversing the order of the rows and taking the first 10, we get the movies preferred by men:
# Reverse order of rows, take first 10 rows
In [90]: sorted_by_diff[::-1][:10]
Out[90]:
gender                                         F         M      diff
title
Good, The Bad and The Ugly, The (1966)  3.494949  4.221300  0.726351
Kentucky Fried Movie, The (1977)        2.878788  3.555147  0.676359
Dumb & Dumber (1994)                    2.697987  3.336595  0.638608
Longest Day, The (1962)                 3.411765  4.031447  0.619682
Cable Guy, The (1996)                   2.250000  2.863787  0.613787
Evil Dead II (Dead By Dawn) (1987)      3.297297  3.909283  0.611985
Hidden, The (1987)                      3.137931  3.745098  0.607167
Rocky III (1982)                        2.361702  2.943503  0.581801
Caddyshack (1980)                       3.396135  3.969737  0.573602
For a Few Dollars More (1965)           3.409091  3.953795  0.544704
# Standard deviation of rating grouped by title
In [91]: rating_std_by_title = data.groupby('title')['rating'].std()
# Filter down to active_titles
In [92]: rating_std_by_title = rating_std_by_title.loc[active_titles]
# Order Series by value in descending order
In [93]: rating_std_by_title.sort_values(ascending=False)[:10]
Out[93]:
title
Dumb & Dumber (1994)                     1.321333
Blair Witch Project, The (1999)          1.316368
Natural Born Killers (1994)              1.307198
Tank Girl (1995)                         1.277695
Rocky Horror Picture Show, The (1975)    1.260177
Eyes Wide Shut (1999)                    1.259624
Evita (1996)                             1.253631
Billy Madison (1995)                     1.249970
Fear and Loathing in Las Vegas (1998)    1.246408
Bicentennial Man (1999)                  1.245533
Name: rating, dtype: float64
In [4]: names.head(10)
Out[4]:
        name sex  births  year
0       Mary   F    7065  1880
1       Anna   F    2604  1880
2       Emma   F    2003  1880
3  Elizabeth   F    1939  1880
4     Minnie   F    1746  1880
5   Margaret   F    1578  1880
6        Ida   F    1472  1880
7      Alice   F    1414  1880
8     Bertha   F    1320  1880
9      Sarah   F    1288  1880
In [96]: names1880 = pd.read_csv('datasets/babynames/yob1880.txt',
   ....:                         names=['name', 'sex', 'births'])
In [97]: names1880
Out[97]:
           name sex  births
0          Mary   F    7065
1          Anna   F    2604
2          Emma   F    2003
3     Elizabeth   F    1939
4        Minnie   F    1746
...         ...  ..     ...
1995     Woodie   M       5
1996     Worthy   M       5
1997     Wright   M       5
1998       York   M       5
1999  Zachariah   M       5

[2000 rows x 3 columns]
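The full names DataFrame shown next is assembled by reading every yearly file and concatenating the pieces; a sketch following the book's approach (the 1880-2010 range matches the data shown below):

# Read each year's file, tag it with the year, and concatenate into one frame
years = range(1880, 2011)
pieces = []
for year in years:
    path = 'datasets/babynames/yob%d.txt' % year
    frame = pd.read_csv(path, names=['name', 'sex', 'births'])
    frame['year'] = year
    pieces.append(frame)
names = pd.concat(pieces, ignore_index=True)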
In [100]: names
Out[100]:
              name sex  births  year
0             Mary   F    7065  1880
1             Anna   F    2604  1880
2             Emma   F    2003  1880
3        Elizabeth   F    1939  1880
4           Minnie   F    1746  1880
...            ...  ..     ...   ...
1690779    Zymaire   M       5  2010
1690780     Zyonne   M       5  2010
1690781  Zyquarius   M       5  2010
1690782      Zyran   M       5  2010
1690783      Zzyzx   M       5  2010

[1690784 rows x 4 columns]
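The prop column that appears below holds each name's fraction of the total births within its (year, sex) group; a sketch of the groupby/apply step that could add it, following the book:

# Compute each name's share of births within its year/sex group
def add_prop(group):
    group['prop'] = group.births / group.births.sum()
    return group

names = names.groupby(['year', 'sex']).apply(add_prop)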
In [105]: names
Out[105]:
              name sex  births  year      prop
0             Mary   F    7065  1880  0.077643
1             Anna   F    2604  1880  0.028618
2             Emma   F    2003  1880  0.022013
3        Elizabeth   F    1939  1880  0.021309
4           Minnie   F    1746  1880  0.019188
...            ...  ..     ...   ...       ...
1690779    Zymaire   M       5  2010  0.000003
1690780     Zyonne   M       5  2010  0.000003
1690781  Zyquarius   M       5  2010  0.000003
1690782      Zyran   M       5  2010  0.000003
1690783      Zzyzx   M       5  2010  0.000003

[1690784 rows x 5 columns]
When performing a group operation like this, it's usually worthwhile to do a sanity check, such as verifying that the prop column sums to 1 within each group:
In [106]: names.groupby(['year', 'sex']).prop.sum()
Out[106]:
year  sex
1880  F      1.0
      M      1.0
1881  F      1.0
      M      1.0
1882  F      1.0
            ...
2008  M      1.0
2009  F      1.0
      M      1.0
2010  F      1.0
      M      1.0
Name: prop, Length: 262, dtype: float64
Next, let's extract the top 1,000 names for each sex/year combination:

def get_top1000(group):
    return group.sort_values(by='births', ascending=False)[:1000]

grouped = names.groupby(['year', 'sex'])
top1000 = grouped.apply(get_top1000)

# Drop the group index, not needed
top1000.reset_index(inplace=True, drop=True)
If you prefer a do-it-yourself approach, you could instead do this:
pieces = []
for year, group in names.groupby(['year', 'sex']):
    pieces.append(group.sort_values(by='births', ascending=False)[:1000])
top1000 = pd.concat(pieces, ignore_index=True)
The resulting dataset is now quite a bit smaller:
In [108]: top1000
Out[108]:
           name sex  births  year      prop
0          Mary   F    7065  1880  0.077643
1          Anna   F    2604  1880  0.028618
2          Emma   F    2003  1880  0.022013
3     Elizabeth   F    1939  1880  0.021309
4        Minnie   F    1746  1880  0.019188
...         ...  ..     ...   ...       ...
261872   Camilo   M     194  2010  0.000102
261873   Destin   M     194  2010  0.000102
261874   Jaquan   M     194  2010  0.000102
261875   Jaydan   M     194  2010  0.000102
261876   Maxton   M     193  2010  0.000102

[261877 rows x 5 columns]
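The df shown next holds only the boys' names for 2010; the intervening steps split top1000 by sex and filter to a single year. A sketch, following the book:

# Split the top-1000 data into boys and girls, then pick out 2010 for boys
boys = top1000[top1000.sex == 'M']
girls = top1000[top1000.sex == 'F']
df = boys[boys.year == 2010]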
In [119]: df
Out[119]:
           name sex  births  year      prop
260877    Jacob   M   21875  2010  0.011523
260878    Ethan   M   17866  2010  0.009411
260879  Michael   M   17133  2010  0.009025
260880   Jayden   M   17030  2010  0.008971
260881  William   M   16870  2010  0.008887
...         ...  ..     ...   ...       ...
261872   Camilo   M     194  2010  0.000102
261873   Destin   M     194  2010  0.000102
261874   Jaquan   M     194  2010  0.000102
261875   Jaydan   M     194  2010  0.000102
261876   Maxton   M     193  2010  0.000102

[1000 rows x 5 columns]
# extract last letter from name column
get_last_letter = lambda x: x[-1]
last_letters = names.name.map(get_last_letter)
last_letters.name = 'last_letter'
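The table reindexed below aggregates births by last letter, sex, and year; a sketch of the pivot_table call that could build it, following the book:

# Aggregate total births by last letter, broken out by sex and year
table = names.pivot_table('births', index=last_letters,
                          columns=['sex', 'year'], aggfunc='sum')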
In [131]: subtable = table.reindex(columns=[1910, 1960, 2010], level='year')
In [132]: subtable.head()
Out[132]:
sex                 F                            M
year             1910      1960      2010     1910      1960      2010
last_letter
a            108376.0  691247.0  670605.0    977.0    5204.0   28438.0
b                 NaN     694.0     450.0    411.0    3912.0   38859.0
c                 5.0      49.0     946.0    482.0   15476.0   23125.0
d              6750.0    3729.0    2607.0  22111.0  262112.0   44398.0
e            133569.0  435013.0  313833.0  28655.0  178823.0  129012.0
In [133]: subtable.sum()
Out[133]:
sex  year
F    1910     396416.0
     1960    2022062.0
     2010    1759010.0
M    1910     194198.0
     1960    2132588.0
     2010    1898382.0
dtype: float64
In [134]: letter_prop = subtable / subtable.sum()
In [135]: letter_prop
Out[135]:
sex                 F                             M
year             1910      1960      2010      1910      1960      2010
last_letter
a            0.273390  0.341853  0.381240  0.005031  0.002440  0.014980
b                 NaN  0.000343  0.000256  0.002116  0.001834  0.020470
c            0.000013  0.000024  0.000538  0.002482  0.007257  0.012181
d            0.017028  0.001844  0.001482  0.113858  0.122908  0.023387
e            0.336941  0.215133  0.178415  0.147556  0.083853  0.067959
...               ...       ...       ...       ...       ...       ...
v                 NaN  0.000060  0.000117  0.000113  0.000037  0.001434
w            0.000020  0.000031  0.001182  0.006329  0.007711  0.016148
x            0.000015  0.000037  0.000727  0.003965  0.001851  0.008614
y            0.110972  0.152569  0.116828  0.077349  0.160987  0.058168
z            0.002439  0.000659  0.000704  0.000170  0.000184  0.001831

[26 rows x 6 columns]
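Note that dny_ts below spans every year from 1880 on, so letter_prop has to be recomputed from the full table rather than the three-year subtable; following the book:

# Recompute letter proportions over the full table (all years)
letter_prop = table / table.sum()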
In [139]: dny_ts = letter_prop.loc[['d', 'n', 'y'], 'M'].T
In [140]: dny_ts.head()
Out[140]:
last_letter         d         n         y
year
1880         0.083055  0.153213  0.075760
1881         0.083247  0.153214  0.077451
1882         0.085340  0.149560  0.077537
1883         0.084066  0.151646  0.079144
1884         0.086120  0.149915  0.080405
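With the years as the index, a simple line plot shows these trends over time (one way to visualize it; the book draws a similar figure):

# Plot the proportion of boys' names ending in d, n, and y by year
dny_ts.plot()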
In [159]: nutrients = pd.DataFrame(db[0]['nutrients'])
In [160]: nutrients[:7]
Out[160]:
                   description        group units    value
0                      Protein  Composition     g    25.18
1            Total lipid (fat)  Composition     g    29.20
2  Carbohydrate, by difference  Composition     g     3.06
3                          Ash        Other     g     3.28
4                       Energy       Energy  kcal   376.00
5                        Water  Composition     g    39.28
6                       Energy       Energy    kJ  1573.00
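The full nutrients frame shown next covers every food in db, with each record tagged by the food's id; a sketch of the assembly loop, following the book:

# Build one DataFrame of nutrients for all foods, tagging rows with the food id
pieces = []
for rec in db:
    fnuts = pd.DataFrame(rec['nutrients'])
    fnuts['id'] = rec['id']
    pieces.append(fnuts)
nutrients = pd.concat(pieces, ignore_index=True)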
In [167]: nutrients
Out[167]:
                               description        group units    value     id
0                                  Protein  Composition     g   25.180   1008
1                        Total lipid (fat)  Composition     g   29.200   1008
2              Carbohydrate, by difference  Composition     g    3.060   1008
3                                      Ash        Other     g    3.280   1008
4                                   Energy       Energy  kcal  376.000   1008
...                                    ...          ...   ...      ...    ...
389350                 Vitamin B-12, added     Vitamins   mcg    0.000  43546
389351                         Cholesterol        Other    mg    0.000  43546
389352        Fatty acids, total saturated        Other     g    0.072  43546
389353  Fatty acids, total monounsaturated        Other     g    0.028  43546
389354  Fatty acids, total polyunsaturated        Other     g    0.041  43546

[389355 rows x 5 columns]
I noticed that, one way or another, this DataFrame contains duplicates, so it's easiest to simply drop them:
In [168]: nutrients.duplicated().sum()  # number of duplicates
Out[168]: 14179
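The drop itself is one call; the row count in the renamed frame below falls from 389355 to 375176, consistent with removing these 14179 duplicates:

# Remove the duplicate rows
nutrients = nutrients.drop_duplicates()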
In [173]: col_mapping = {'description' : 'nutrient',
   .....:                'group' : 'nutgroup'}

In [174]: nutrients = nutrients.rename(columns=col_mapping, copy=False)
In [175]: nutrients
Out[175]:
                                  nutrient     nutgroup units    value     id
0                                  Protein  Composition     g   25.180   1008
1                        Total lipid (fat)  Composition     g   29.200   1008
2              Carbohydrate, by difference  Composition     g    3.060   1008
3                                      Ash        Other     g    3.280   1008
4                                   Energy       Energy  kcal  376.000   1008
...                                    ...          ...   ...      ...    ...
389350                 Vitamin B-12, added     Vitamins   mcg    0.000  43546
389351                         Cholesterol        Other    mg    0.000  43546
389352        Fatty acids, total saturated        Other     g    0.072  43546
389353  Fatty acids, total monounsaturated        Other     g    0.028  43546
389354  Fatty acids, total polyunsaturated        Other     g    0.041  43546

[375176 rows x 5 columns]
In [220]: grouped = fec_mrbo.groupby(['cand_nm', 'contbr_st'])
In [221]: totals = grouped.contb_receipt_amt.sum().unstack(0).fillna(0)
In [222]: totals = totals[totals.sum(1) > 100000]
In [223]: totals[:10]
Out[223]:
cand_nm    Obama, Barack  Romney, Mitt
contbr_st
AK             281840.15      86204.24
AL             543123.48     527303.51
AR             359247.28     105556.00
AZ            1506476.98    1888436.23
CA           23824984.24   11237636.60
CO            2132429.49    1506714.12
CT            2068291.26    3499475.45
DC            4373538.80    1025137.50
DE             336669.14      82712.00
FL            7318178.58    8338458.81
If you divide each row by the total contribution amount, you get the relative percentage of total donations by state for each candidate:
In [224]: percent = totals.div(totals.sum(1), axis=0)
In [225]: percent[:10]
Out[225]:
cand_nm    Obama, Barack  Romney, Mitt
contbr_st
AK              0.765778      0.234222
AL              0.507390      0.492610
AR              0.772902      0.227098
AZ              0.443745      0.556255
CA              0.679498      0.320502
CO              0.585970      0.414030
CT              0.371476      0.628524
DC              0.810113      0.189887
DE              0.802776      0.197224
FL              0.467417      0.532583
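These per-state shares lend themselves to a horizontal bar plot; a sketch using pandas' plotting API (the slice and chart style are my choice, not prescribed by the text):

# Plot each state's donation split as horizontal bars
percent[:10].plot.barh()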