From 4437b0e3f0af8930830f4c79fad1e36a437a634a Mon Sep 17 00:00:00 2001 From: wizardforcel <562826179@qq.com> Date: Sat, 29 Dec 2018 15:58:46 +0800 Subject: [PATCH] 20 --- 20.md | 986 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 986 insertions(+) create mode 100644 20.md diff --git a/20.md b/20.md new file mode 100644 index 0000000..c9966c0 --- /dev/null +++ b/20.md @@ -0,0 +1,986 @@ +# 二十、数据可视化 + +> 作者:[Chris Albon](https://chrisalbon.com/) +> +> 译者:[飞龙](https://github.com/wizardforcel) +> +> 协议:[CC BY-NC-SA 4.0](http://creativecommons.org/licenses/by-nc-sa/4.0/) + +## MatPlotLib 中的双向条形图 + +``` +%matplotlib inline +import pandas as pd +import matplotlib.pyplot as plt +import numpy as np + +# 创建数据帧 +raw_data = {'first_name': ['Jason', 'Molly', 'Tina', 'Jake', 'Amy'], + 'pre_score': [4, 24, 31, 2, 3], + 'mid_score': [25, 94, 57, 62, 70], + 'post_score': [5, 43, 23, 23, 51]} +df = pd.DataFrame(raw_data, columns = ['first_name', 'pre_score', 'mid_score', 'post_score']) +df +``` + +| | first_name | pre_score | mid_score | post_score | +| --- | --- | --- | --- | --- | +| 0 | Jason | 4 | 25 | 5 | +| 1 | Molly | 24 | 94 | 43 | +| 2 | Tina | 31 | 57 | 23 | +| 3 | Jake | 2 | 62 | 23 | +| 4 | Amy | 3 | 70 | 51 | + +``` +# 输入数据,特别是第二和 +# 第三行,跳过第一列 +x1 = df.ix[1, 1:] +x2 = df.ix[2, 1:] + +# 创建条形标签 +bar_labels = ['Pre Score', 'Mid Score', 'Post Score'] + +# 创建图形 +fig = plt.figure(figsize=(8,6)) + +# 设置 y 的位置 +y_pos = np.arange(len(x1)) +y_pos = [x for x in y_pos] +plt.yticks(y_pos, bar_labels, fontsize=10) + +# 在 y_pos 的位置上创建水平条形 +plt.barh(y_pos, + # 使用数据 x1 + x1, + # 中心对齐 + align='center', + # 透明度为 0.4 + alpha=0.4, + # 颜色为绿色 + color='#263F13') + +# 在 y_pos 的位置上创建水平条形 +plt.barh(y_pos, + # 使用数据 -x2 + -x2, + # 中心对齐 + align='center', + # 透明度为 0.4 + alpha=0.4, + # 颜色为绿色 + color='#77A61D') + +# 注解和标签 +plt.xlabel('Tina\'s Score: Light Green. Molly\'s Score: Dark Green') +t = plt.title('Comparison of Molly and Tina\'s Score') +plt.ylim([-1,len(x1)+0.1]) +plt.xlim([-max(x2)-10, max(x1)+10]) +plt.grid() + +plt.show() +``` + +![png](https://chrisalbon.com/python/data_visualization/matplotlib_back_to_back_bar_plot_6_0.png) + +## MatPlotLib 中的条形图 + +``` +%matplotlib inline +import pandas as pd +import matplotlib.pyplot as plt +import numpy as np + +# 创建数据帧 +raw_data = {'first_name': ['Jason', 'Molly', 'Tina', 'Jake', 'Amy'], + 'pre_score': [4, 24, 31, 2, 3], + 'mid_score': [25, 94, 57, 62, 70], + 'post_score': [5, 43, 23, 23, 51]} +df = pd.DataFrame(raw_data, columns = ['first_name', 'pre_score', 'mid_score', 'post_score']) +df +``` + +| | first_name | pre_score | mid_score | post_score | +| --- | --- | --- | --- | --- | +| 0 | Jason | 4 | 25 | 5 | +| 1 | Molly | 24 | 94 | 43 | +| 2 | Tina | 31 | 57 | 23 | +| 3 | Jake | 2 | 62 | 23 | +| 4 | Amy | 3 | 70 | 51 | + +``` +# 为每个变量创建得分均值的列表 +mean_values = [df['pre_score'].mean(), df['mid_score'].mean(), df['post_score'].mean()] + +# 创建变动列表,设为得分上下 .25 +variance = [df['pre_score'].mean() * 0.25, df['pre_score'].mean() * 0.25, df['pre_score'].mean() * 0.25] + +# 设置条形标签 +bar_labels = ['Pre Score', 'Mid Score', 'Post Score'] + +# 创建条形的 x 位置 +x_pos = list(range(len(bar_labels))) + +# 在 x 位置上创建条形图 +plt.bar(x_pos, + # 使用 mean_values 中的数据 + mean_values, + # y-error 直线设置为变动 + yerr=variance, + # 中心对齐 + align='center', + # 颜色 + color='#FFC222', + # 透明度为 0.5 + alpha=0.5) + +# 添加网格 +plt.grid() + +# 设置 y 轴高度 +max_y = max(zip(mean_values, variance)) # returns a tuple, here: (3, 5) +plt.ylim([0, (max_y[0] + max_y[1]) * 1.1]) + +# 设置轴标签和标题 +plt.ylabel('Score') +plt.xticks(x_pos, bar_labels) +plt.title('Mean Scores For Each Test') + +plt.show() +``` + +![png](https://chrisalbon.com/python/data_visualization/matplotlib_bar_plot_6_0.png) + +## Seaborn 中的调色板 + +``` +import pandas as pd +%matplotlib inline +import matplotlib.pyplot as plt +import seaborn as sns + +# 创建数据帧 +data = {'date': ['2014-05-01 18:47:05.069722', '2014-05-01 18:47:05.119994', '2014-05-02 18:47:05.178768', '2014-05-02 18:47:05.230071', '2014-05-02 18:47:05.230071', '2014-05-02 18:47:05.280592', '2014-05-03 18:47:05.332662', '2014-05-03 18:47:05.385109', '2014-05-04 18:47:05.436523', '2014-05-04 18:47:05.486877'], + 'deaths_regiment_1': [34, 43, 14, 15, 15, 14, 31, 25, 62, 41], + 'deaths_regiment_2': [52, 66, 78, 15, 15, 5, 25, 25, 86, 1], + 'deaths_regiment_3': [13, 73, 82, 58, 52, 87, 26, 5, 56, 75], + 'deaths_regiment_4': [44, 75, 26, 15, 15, 14, 54, 25, 24, 72], + 'deaths_regiment_5': [25, 24, 25, 15, 57, 68, 21, 27, 62, 5], + 'deaths_regiment_6': [84, 84, 26, 15, 15, 14, 26, 25, 62, 24], + 'deaths_regiment_7': [46, 57, 26, 15, 15, 14, 26, 25, 62, 41]} +df = pd.DataFrame(data, columns = ['date', 'battle_deaths', 'deaths_regiment_1', 'deaths_regiment_2', + 'deaths_regiment_3', 'deaths_regiment_4', 'deaths_regiment_5', + 'deaths_regiment_6', 'deaths_regiment_7']) +df = df.set_index(df.date) + +sns.palplot(sns.color_palette("deep", 10)) +``` + +![png](https://chrisalbon.com/python/data_visualization/seaborn_color_palettes_5_0.png) + +``` +sns.palplot(sns.color_palette("muted", 10)) +``` + +![png](https://chrisalbon.com/python/data_visualization/seaborn_color_palettes_6_0.png) + +``` +sns.palplot(sns.color_palette("bright", 10)) +``` + +![png](https://chrisalbon.com/python/data_visualization/seaborn_color_palettes_7_0.png) + +``` +sns.palplot(sns.color_palette("dark", 10)) +``` + +![png](https://chrisalbon.com/python/data_visualization/seaborn_color_palettes_8_0.png) + +``` +sns.palplot(sns.color_palette("colorblind", 10)) +``` + +![png](https://chrisalbon.com/python/data_visualization/seaborn_color_palettes_9_0.png) + +``` +sns.palplot(sns.color_palette("Paired", 10)) +``` + +![png](https://chrisalbon.com/python/data_visualization/seaborn_color_palettes_10_0.png) + +``` +sns.palplot(sns.color_palette("BuGn", 10)) +``` + +![png](https://chrisalbon.com/python/data_visualization/seaborn_color_palettes_11_0.png) + +``` +sns.palplot(sns.color_palette("GnBu", 10)) +``` + +![png](https://chrisalbon.com/python/data_visualization/seaborn_color_palettes_12_0.png) + +``` +sns.palplot(sns.color_palette("OrRd", 10)) +``` + +![png](https://chrisalbon.com/python/data_visualization/seaborn_color_palettes_13_0.png) + +``` +sns.palplot(sns.color_palette("PuBu", 10)) +``` + +![png](https://chrisalbon.com/python/data_visualization/seaborn_color_palettes_14_0.png) + +``` +sns.palplot(sns.color_palette("YlGn", 10)) +``` + +![png](https://chrisalbon.com/python/data_visualization/seaborn_color_palettes_15_0.png) + +``` +sns.palplot(sns.color_palette("YlGnBu", 10)) +``` + +![png](https://chrisalbon.com/python/data_visualization/seaborn_color_palettes_16_0.png) + +``` +sns.palplot(sns.color_palette("YlOrBr", 10)) +``` + +![png](https://chrisalbon.com/python/data_visualization/seaborn_color_palettes_17_0.png) + +``` +sns.palplot(sns.color_palette("YlOrRd", 10)) +``` + +![png](https://chrisalbon.com/python/data_visualization/seaborn_color_palettes_18_0.png) + +``` +sns.palplot(sns.color_palette("BrBG", 10)) +``` + +![png](https://chrisalbon.com/python/data_visualization/seaborn_color_palettes_19_0.png) + +``` +sns.palplot(sns.color_palette("PiYG", 10)) +``` + +![png](https://chrisalbon.com/python/data_visualization/seaborn_color_palettes_20_0.png) + +``` +sns.palplot(sns.color_palette("PRGn", 10)) +``` + +![png](https://chrisalbon.com/python/data_visualization/seaborn_color_palettes_21_0.png) + +``` +sns.palplot(sns.color_palette("PuOr", 10)) +``` + +![png](https://chrisalbon.com/python/data_visualization/seaborn_color_palettes_22_0.png) + +``` +sns.palplot(sns.color_palette("RdBu", 10)) +``` + +![png](https://chrisalbon.com/python/data_visualization/seaborn_color_palettes_23_0.png) + +``` +sns.palplot(sns.color_palette("RdGy", 10)) +``` + +![png](https://chrisalbon.com/python/data_visualization/seaborn_color_palettes_24_0.png) + +``` +sns.palplot(sns.color_palette("RdYlBu", 10)) +``` + +![png](https://chrisalbon.com/python/data_visualization/seaborn_color_palettes_25_0.png) + +``` +sns.palplot(sns.color_palette("RdYlGn", 10)) +``` + +![png](https://chrisalbon.com/python/data_visualization/seaborn_color_palettes_26_0.png) + +``` +sns.palplot(sns.color_palette("Spectral", 10)) +``` + +![png](https://chrisalbon.com/python/data_visualization/seaborn_color_palettes_27_0.png) + +``` +# 创建调色板并将其设为当前调色板 +flatui = ["#9b59b6", "#3498db", "#95a5a6", "#e74c3c", "#34495e", "#2ecc71"] +sns.set_palette(flatui) +sns.palplot(sns.color_palette()) +``` + +![png](https://chrisalbon.com/python/data_visualization/seaborn_color_palettes_29_0.png) + +``` +# 设置绘图颜色 +sns.tsplot([df.deaths_regiment_1, df.deaths_regiment_2, df.deaths_regiment_3, df.deaths_regiment_4, + df.deaths_regiment_5, df.deaths_regiment_6, df.deaths_regiment_7], color="#34495e") + +# +``` + +![png](https://chrisalbon.com/python/data_visualization/seaborn_color_palettes_31_1.png) + +## 使用 Seaborn 和 pandas 创建时间序列绘图 + +``` +import pandas as pd +%matplotlib inline +import matplotlib.pyplot as plt +import seaborn as sns + +data = {'date': ['2014-05-01 18:47:05.069722', '2014-05-01 18:47:05.119994', '2014-05-02 18:47:05.178768', '2014-05-02 18:47:05.230071', '2014-05-02 18:47:05.230071', '2014-05-02 18:47:05.280592', '2014-05-03 18:47:05.332662', '2014-05-03 18:47:05.385109', '2014-05-04 18:47:05.436523', '2014-05-04 18:47:05.486877'], + 'deaths_regiment_1': [34, 43, 14, 15, 15, 14, 31, 25, 62, 41], + 'deaths_regiment_2': [52, 66, 78, 15, 15, 5, 25, 25, 86, 1], + 'deaths_regiment_3': [13, 73, 82, 58, 52, 87, 26, 5, 56, 75], + 'deaths_regiment_4': [44, 75, 26, 15, 15, 14, 54, 25, 24, 72], + 'deaths_regiment_5': [25, 24, 25, 15, 57, 68, 21, 27, 62, 5], + 'deaths_regiment_6': [84, 84, 26, 15, 15, 14, 26, 25, 62, 24], + 'deaths_regiment_7': [46, 57, 26, 15, 15, 14, 26, 25, 62, 41]} +df = pd.DataFrame(data, columns = ['date', 'battle_deaths', 'deaths_regiment_1', 'deaths_regiment_2', + 'deaths_regiment_3', 'deaths_regiment_4', 'deaths_regiment_5', + 'deaths_regiment_6', 'deaths_regiment_7']) +df = df.set_index(df.date) + +sns.tsplot([df.deaths_regiment_1, df.deaths_regiment_2, df.deaths_regiment_3, df.deaths_regiment_4, + df.deaths_regiment_5, df.deaths_regiment_6, df.deaths_regiment_7], color="indianred") + +# +``` + +![png](https://chrisalbon.com/python/data_visualization/seaborn_pandas_timeseries_plot_5_1.png) + +``` +# 带有置信区间直线,但是没有直线的时间序列绘图 +sns.tsplot([df.deaths_regiment_1, df.deaths_regiment_2, df.deaths_regiment_3, df.deaths_regiment_4, + df.deaths_regiment_5, df.deaths_regiment_6, df.deaths_regiment_7], err_style="ci_bars", interpolate=False) + +# +``` + +![png](https://chrisalbon.com/python/data_visualization/seaborn_pandas_timeseries_plot_7_1.png) + +## 使用 Seaborn 创建散点图 + +``` +import pandas as pd +%matplotlib inline +import random +import matplotlib.pyplot as plt +import seaborn as sns + +# 创建空数据帧 +df = pd.DataFrame() + +# 添加列 +df['x'] = random.sample(range(1, 1000), 5) +df['y'] = random.sample(range(1, 1000), 5) +df['z'] = [1,0,0,1,0] +df['k'] = ['male','male','male','female','female'] + +# 查看前几行数据 +df.head() +``` + +| | x | y | z | k | +| --- | --- | --- | --- | --- | +| 0 | 466 | 948 | 1 | male | +| 1 | 832 | 481 | 0 | male | +| 2 | 978 | 465 | 0 | male | +| 3 | 510 | 206 | 1 | female | +| 4 | 848 | 357 | 0 | female | + +``` +# 设置散点图样式 +sns.set_context("notebook", font_scale=1.1) +sns.set_style("ticks") + +# 创建数据帧的散点图 +sns.lmplot('x', # 横轴 + 'y', # 纵轴 + data=df, # 数据源 + fit_reg=False, # 不要拟合回归直线 + hue="z", # 设置颜色 + scatter_kws={"marker": "D", # 设置标记样式 + "s": 100}) # 设置标记大小 + +# 设置标题 +plt.title('Histogram of IQ') + +# 设置横轴标签 +plt.xlabel('Time') + +# 设置纵轴标签 +plt.ylabel('Deaths') + +# +``` + +![png](https://chrisalbon.com/python/data_visualization/seaborn_scatterplot_7_1.png) + +## MatPlotLib 中的分组条形图 + +``` +%matplotlib inline +import pandas as pd +import matplotlib.pyplot as plt +import numpy as np + +raw_data = {'first_name': ['Jason', 'Molly', 'Tina', 'Jake', 'Amy'], + 'pre_score': [4, 24, 31, 2, 3], + 'mid_score': [25, 94, 57, 62, 70], + 'post_score': [5, 43, 23, 23, 51]} +df = pd.DataFrame(raw_data, columns = ['first_name', 'pre_score', 'mid_score', 'post_score']) +df +``` + +| | first_name | pre_score | mid_score | post_score | +| --- | --- | --- | --- | --- | +| 0 | Jason | 4 | 25 | 5 | +| 1 | Molly | 24 | 94 | 43 | +| 2 | Tina | 31 | 57 | 23 | +| 3 | Jake | 2 | 62 | 23 | +| 4 | Amy | 3 | 70 | 51 | + +``` +# 设置条形的位置和宽度 +pos = list(range(len(df['pre_score']))) +width = 0.25 + +# 绘制条形 +fig, ax = plt.subplots(figsize=(10,5)) + +# 使用 pre_score 数据, +# 在位置 pos 上创建条形 +plt.bar(pos, + # 使用数据 df['pre_score'] + df['pre_score'], + # 宽度 + width, + # 透明度为 0.5 + alpha=0.5, + # 颜色 + color='#EE3224', + # 标签是 first_name 的第一个值 + label=df['first_name'][0]) + +# 使用 mid_score 数据, +# 在位置 pos + 一定宽度上创建条形 +plt.bar([p + width for p in pos], + # 使用数据 df['mid_score'] + df['mid_score'], + # 宽度 + width, + # 透明度为 0.5 + alpha=0.5, + # 颜色 + color='#F78F1E', + # 标签是 first_name 的第二个值 + label=df['first_name'][1]) + +# 使用 post_score 数据, +# 在位置 pos + 一定宽度上创建条形 +plt.bar([p + width*2 for p in pos], + # 使用数据 df['post_score'] + df['post_score'], + # 宽度 + width, + # 透明度为 0.5 + alpha=0.5, + # 颜色 + color='#FFC222', + # 标签是 first_name 的第三个值 + label=df['first_name'][2]) + +# 设置纵轴标签 +ax.set_ylabel('Score') + +# 设置标题 +ax.set_title('Test Subject Scores') + +# 设置 x 刻度的位置 +ax.set_xticks([p + 1.5 * width for p in pos]) + +# 设置 x 刻度的标签 +ax.set_xticklabels(df['first_name']) + +# 设置横轴和纵轴的区域 +plt.xlim(min(pos)-width, max(pos)+width*4) +plt.ylim([0, max(df['pre_score'] + df['mid_score'] + df['post_score'])] ) + +# 添加图例并展示绘图 +plt.legend(['Pre Score', 'Mid Score', 'Post Score'], loc='upper left') +plt.grid() +plt.show() +``` + +![png](https://chrisalbon.com/python/data_visualization/matplotlib_grouped_bar_plot_6_0.png) + +## MatPlotLib 中的直方图 + +``` +%matplotlib inline +import pandas as pd +import matplotlib.pyplot as plt +import numpy as np +import math + +# 设置 ipython 的最大行数 +pd.set_option('display.max_row', 1000) + +# 将 ipython 的最大列宽设为 50 +pd.set_option('display.max_columns', 50) + +df = pd.read_csv('https://www.dropbox.com/s/52cb7kcflr8qm2u/5kings_battles_v1.csv?dl=1') +df.head() +``` + +| | name | year | battle_number | attacker_king | defender_king | attacker_1 | attacker_2 | attacker_3 | attacker_4 | defender_1 | defender_2 | defender_3 | defender_4 | attacker_outcome | battle_type | major_death | major_capture | attacker_size | defender_size | attacker_commander | defender_commander | summer | location | region | note | +| --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | +| 0 | Battle of the Golden Tooth | 298 | 1 | Joffrey/Tommen Baratheon | Robb Stark | Lannister | NaN | NaN | NaN | Tully | NaN | NaN | NaN | win | pitched battle | 1 | 0 | 15000 | 4000 | Jaime Lannister | Clement Piper, Vance | 1 | Golden Tooth | The Westerlands | NaN | +| 1 | Battle at the Mummer's Ford | 298 | 2 | Joffrey/Tommen Baratheon | Robb Stark | Lannister | NaN | NaN | NaN | Baratheon | NaN | NaN | NaN | win | ambush | 1 | 0 | NaN | 120 | Gregor Clegane | Beric Dondarrion | 1 | Mummer's Ford | The Riverlands | NaN | +| 2 | Battle of Riverrun | 298 | 3 | Joffrey/Tommen Baratheon | Robb Stark | Lannister | NaN | NaN | NaN | Tully | NaN | NaN | NaN | win | pitched battle | 0 | 1 | 15000 | 10000 | Jaime Lannister, Andros Brax | Edmure Tully, Tytos Blackwood | 1 | Riverrun | The Riverlands | NaN | +| --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | +| 3 | Battle of the Green Fork | 298 | 4 | Robb Stark | Joffrey/Tommen Baratheon | Stark | NaN | NaN | NaN | Lannister | NaN | NaN | NaN | loss | pitched battle | 1 | 1 | 18000 | 20000 | Roose Bolton, Wylis Manderly, Medger Cerwyn, H... | Tywin Lannister, Gregor Clegane, Kevan Lannist... | 1 | Green Fork | The Riverlands | NaN | +| 4 | Battle of the Whispering Wood | 298 | 5 | Robb Stark | Joffrey/Tommen Baratheon | Stark | Tully | NaN | NaN | Lannister | NaN | NaN | NaN | win | ambush | 1 | 1 | 1875 | 6000 | Robb Stark, Brynden Tully | Jaime Lannister | 1 | Whispering Wood | The Riverlands | NaN | + +``` +# 制作攻击方和防守方大小的两个变量 +# 但是当有超过 10000 个攻击方时将其排除在外 +data1 = df['attacker_size'][df['attacker_size'] < 90000] +data2 = df['defender_size'][df['attacker_size'] < 90000] + +# 创建 2000 个桶 +bins = np.arange(data1.min(), data2.max(), 2000) # 固定桶的大小 + +# 绘制攻击方大小的直方图 +plt.hist(data1, + bins=bins, + alpha=0.5, + color='#EDD834', + label='Attacker') + +# 绘制防守方大小的直方图 +plt.hist(data2, + bins=bins, + alpha=0.5, + color='#887E43', + label='Defender') + +# 设置图形的 x 和 y 边界 +plt.ylim([0, 10]) + +# 设置标题和标签 +plt.title('Histogram of Attacker and Defender Size') +plt.xlabel('Number of troops') +plt.ylabel('Number of battles') +plt.legend(loc='upper right') + +plt.show() +``` + +![png](https://chrisalbon.com/python/data_visualization/matplotlib_histogram_6_0.png) + +``` +# 制作攻击方和防守方大小的两个变量 +# 但是当有超过 10000 个攻击方时将其排除在外 +data1 = df['attacker_size'][df['attacker_size'] < 90000] +data2 = df['defender_size'][df['attacker_size'] < 90000] + +# 创建 10 个桶,最小值为 +# data1 和 data2 的最小值 +bins = np.linspace(min(data1 + data2), + # 最大值为它们的最大值 + max(data1 + data2), + # 并分为 10 个桶 + 10) + +# 绘制攻击方大小的直方图 +plt.hist(data1, + # 使用定义好的桶 + bins=bins, + # 透明度 + alpha=0.5, + # 颜色 + color='#EDD834', + # 攻击方的标签 + label='Attacker') + +# 绘制防守方大小的直方图 +plt.hist(data2, + # 使用定义好的桶 + bins=bins, + # 透明度 + alpha=0.5, + # 颜色 + color='#887E43', + # 防守方的标签 + label='Defender') + +# 设置图形的 x 和 y 边界 +plt.ylim([0, 10]) + +# 设置标题和标签 +plt.title('Histogram of Attacker and Defender Size') +plt.xlabel('Number of troops') +plt.ylabel('Number of battles') +plt.legend(loc='upper right') + +plt.show() +``` + +![png](https://chrisalbon.com/python/data_visualization/matplotlib_histogram_8_0.png) + +## 从 Pandas 数据帧生成 MatPlotLib 散点图 + +``` +%matplotlib inline +import pandas as pd +import matplotlib.pyplot as plt +import numpy as np + +raw_data = {'first_name': ['Jason', 'Molly', 'Tina', 'Jake', 'Amy'], + 'last_name': ['Miller', 'Jacobson', 'Ali', 'Milner', 'Cooze'], + 'female': [0, 1, 1, 0, 1], + 'age': [42, 52, 36, 24, 73], + 'preTestScore': [4, 24, 31, 2, 3], + 'postTestScore': [25, 94, 57, 62, 70]} +df = pd.DataFrame(raw_data, columns = ['first_name', 'last_name', 'age', 'female', 'preTestScore', 'postTestScore']) +df +``` + +| | first_name | last_name | age | female | preTestScore | postTestScore | +| --- | --- | --- | --- | --- | --- | --- | +| 0 | Jason | Miller | 42 | 0 | 4 | 25 | +| 1 | Molly | Jacobson | 52 | 1 | 24 | 94 | +| 2 | Tina | Ali | 36 | 1 | 31 | 57 | +| 3 | Jake | Milner | 24 | 0 | 2 | 62 | +| 4 | Amy | Cooze | 73 | 1 | 3 | 70 | + +``` +# preTestScore 和 postTestScore 的散点图 +# 每个点的大小取决于年龄 +plt.scatter(df.preTestScore, df.postTestScore +, s=df.age) + +# +``` + +![png](https://chrisalbon.com/python/data_visualization/matplotlib_scatterplot_from_pandas_6_1.png) + +``` +# preTestScore 和 postTestScore 的散点图 +# 大小为 300,颜色取决于性别 +plt.scatter(df.preTestScore, df.postTestScore, s=300, c=df.female) + +# +``` + +![png](https://chrisalbon.com/python/data_visualization/matplotlib_scatterplot_from_pandas_8_1.png) + +## Matplotlib 的简单示例 + +``` +# 让 Jupyter 加载 matplotlib +# 并内联创建所有绘图(也就是在页面上) +%matplotlib inline + +import matplotlib.pyplot as pyplot + +pyplot.plot([1.6, 2.7]) + +# [] +``` + +![png](https://chrisalbon.com/python/data_visualization/matplotlib_simple_example_6_1.png) + +## MatPlotLib 中的饼图 + +``` +%matplotlib inline +import pandas as pd +import matplotlib.pyplot as plt + +raw_data = {'officer_name': ['Jason', 'Molly', 'Tina', 'Jake', 'Amy'], + 'jan_arrests': [4, 24, 31, 2, 3], + 'feb_arrests': [25, 94, 57, 62, 70], + 'march_arrests': [5, 43, 23, 23, 51]} +df = pd.DataFrame(raw_data, columns = ['officer_name', 'jan_arrests', 'feb_arrests', 'march_arrests']) +df +``` + +| | officer_name | jan_arrests | feb_arrests | march_arrests | +| --- | --- | --- | --- | --- | +| 0 | Jason | 4 | 25 | 5 | +| 1 | Molly | 24 | 94 | 43 | +| 2 | Tina | 31 | 57 | 23 | +| 3 | Jake | 2 | 62 | 23 | +| 4 | Amy | 3 | 70 | 51 | + +``` +# 创建一列,其中包含每个官员的总逮捕数 +df['total_arrests'] = df['jan_arrests'] + df['feb_arrests'] + df['march_arrests'] +df +``` + +| | officer_name | jan_arrests | feb_arrests | march_arrests | total_arrests | +| --- | --- | --- | --- | --- | --- | +| 0 | Jason | 4 | 25 | 5 | 34 | +| 1 | Molly | 24 | 94 | 43 | 161 | +| 2 | Tina | 31 | 57 | 23 | 111 | +| 3 | Jake | 2 | 62 | 23 | 87 | +| 4 | Amy | 3 | 70 | 51 | 124 | + +``` +# (从 iWantHue)创建一列颜色 +colors = ["#E13F29", "#D69A80", "#D63B59", "#AE5552", "#CB5C3B", "#EB8076", "#96624E"] + +# 创建饼图 +plt.pie( + # 使用数据 total_arrests + df['total_arrests'], + # 标签为官员名称 + labels=df['officer_name'], + # 没有阴影 + shadow=False, + # 颜色 + colors=colors, + # 将一块扇形移出去 + explode=(0, 0, 0, 0, 0.15), + # 起始角度为 90 度 + startangle=90, + # 将百分比列为分数 + autopct='%1.1f%%', + ) + +# 使饼状图为正圆 +plt.axis('equal') + +# 查看绘图 +plt.tight_layout() +plt.show() +``` + +![png](https://chrisalbon.com/python/data_visualization/matplotlib_pie_chart_7_0.png) + +## MatPlotLib 中的散点图 + +``` +%matplotlib inline +import pandas as pd +import matplotlib.pyplot as plt +import numpy as np + +# 展示 ipython 的最大行数 +pd.set_option('display.max_row', 1000) + +# 将 ipython 的最大列宽设为 50 +pd.set_option('display.max_columns', 50) + +df = pd.read_csv('https://raw.githubusercontent.com/chrisalbon/war_of_the_five_kings_dataset/master/5kings_battles_v1.csv') +df.head() +``` + +| | name | year | battle_number | attacker_king | defender_king | attacker_1 | attacker_2 | attacker_3 | attacker_4 | defender_1 | defender_2 | defender_3 | defender_4 | attacker_outcome | battle_type | major_death | major_capture | attacker_size | defender_size | attacker_commander | defender_commander | summer | location | region | note | +| --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | +| 0 | Battle of the Golden Tooth | 298 | 1 | Joffrey/Tommen Baratheon | Robb Stark | Lannister | NaN | NaN | NaN | Tully | NaN | NaN | NaN | win | pitched battle | 1.0 | 0.0 | 15000.0 | 4000.0 | Jaime Lannister | Clement Piper, Vance | 1.0 | Golden Tooth | The Westerlands | NaN | +| 1 | Battle at the Mummer's Ford | 298 | 2 | Joffrey/Tommen Baratheon | Robb Stark | Lannister | NaN | NaN | NaN | Baratheon | NaN | NaN | NaN | win | ambush | 1.0 | 0.0 | NaN | 120.0 | Gregor Clegane | Beric Dondarrion | 1.0 | Mummer's Ford | The Riverlands | NaN | +| 2 | Battle of Riverrun | 298 | 3 | Joffrey/Tommen Baratheon | Robb Stark | Lannister | NaN | NaN | NaN | Tully | NaN | NaN | NaN | win | pitched battle | 0.0 | 1.0 | 15000.0 | 10000.0 | Jaime Lannister, Andros Brax | Edmure Tully, Tytos Blackwood | 1.0 | Riverrun | The Riverlands | NaN | +| 3 | Battle of the Green Fork | 298 | 4 | Robb Stark | Joffrey/Tommen Baratheon | Stark | NaN | NaN | NaN | Lannister | NaN | NaN | NaN | loss | pitched battle | 1.0 | 1.0 | 18000.0 | 20000.0 | Roose Bolton, Wylis Manderly, Medger Cerwyn, H... | Tywin Lannister, Gregor Clegane, Kevan Lannist... | 1.0 | Green Fork | The Riverlands | NaN | +| 4 | Battle of the Whispering Wood | 298 | 5 | Robb Stark | Joffrey/Tommen Baratheon | Stark | Tully | NaN | NaN | Lannister | NaN | NaN | NaN | win | ambush | 1.0 | 1.0 | 1875.0 | 6000.0 | Robb Stark, Brynden Tully | Jaime Lannister | 1.0 | Whispering Wood | The Riverlands | NaN | + +``` +# 创建图形 +plt.figure(figsize=(10,8)) + +# 创建散点图 + # 298 年的攻击方大小为 x 轴 +plt.scatter(df['attacker_size'][df['year'] == 298], + # 298 年的防守方大小为 y 轴 + df['defender_size'][df['year'] == 298], + # 标记 + marker='x', + # 颜色 + color='b', + # 透明度 + alpha=0.7, + # 大小 + s = 124, + # 标签 + label='Year 298') + + # 299 年的攻击方大小为 x 轴 +plt.scatter(df['attacker_size'][df['year'] == 299], + # 299 年的防守方大小为 y 轴 + df['defender_size'][df['year'] == 299], + # 标记 + marker='o', + # 颜色 + color='r', + # 透明度 + alpha=0.7, + # 大小 + s = 124, + # 标签 + label='Year 299') + + # 300 年的攻击方大小为 x 轴 +plt.scatter(df['attacker_size'][df['year'] == 300], + # 300 年的防守方大小为 x 轴 + df['defender_size'][df['year'] == 300], + # 标记 + marker='^', + # 颜色 + color='g', + # 透明度 + alpha=0.7, + # 大小 + s = 124, + # 标签 + label='Year 300') + +# 标题 +plt.title('Battles Of The War Of The Five Kings') + +# y 标签 +plt.ylabel('Defender Size') + +# x 标签 +plt.xlabel('Attacker Size') + +# 图例 +plt.legend(loc='upper right') + +# 设置图形边界 +plt.xlim([min(df['attacker_size'])-1000, max(df['attacker_size'])+1000]) +plt.ylim([min(df['defender_size'])-1000, max(df['defender_size'])+1000]) + +plt.show() +``` + +![png](https://chrisalbon.com/python/data_visualization/matplotlib_simple_scatterplot_6_0.png) + +## MatPlotLib 中的栈式百分比条形图 + +``` +%matplotlib inline +import pandas as pd +import matplotlib.pyplot as plt + +raw_data = {'first_name': ['Jason', 'Molly', 'Tina', 'Jake', 'Amy'], + 'pre_score': [4, 24, 31, 2, 3], + 'mid_score': [25, 94, 57, 62, 70], + 'post_score': [5, 43, 23, 23, 51]} +df = pd.DataFrame(raw_data, columns = ['first_name', 'pre_score', 'mid_score', 'post_score']) +df +``` + +| | first_name | pre_score | mid_score | post_score | +| --- | --- | --- | --- | --- | +| 0 | Jason | 4 | 25 | 5 | +| 1 | Molly | 24 | 94 | 43 | +| 2 | Tina | 31 | 57 | 23 | +| 3 | Jake | 2 | 62 | 23 | +| 4 | Amy | 3 | 70 | 51 | + +``` +# 创建带有一个子图的图形 +f, ax = plt.subplots(1, figsize=(10,5)) + +# 将条宽设为 1 +bar_width = 1 + +# 条形左边界的位置 +bar_l = [i for i in range(len(df['pre_score']))] + +# x 轴刻度的位置(条形的中心是条形标签) +tick_pos = [i+(bar_width/2) for i in bar_l] + +# 创建每个参与者的总得分 +totals = [i+j+k for i,j,k in zip(df['pre_score'], df['mid_score'], df['post_score'])] + +# 创建每个参与者的 pre_score 和总得分的百分比 +pre_rel = [i / j * 100 for i,j in zip(df['pre_score'], totals)] + +# 创建每个参与者的 mid_score 和总得分的百分比 +mid_rel = [i / j * 100 for i,j in zip(df['mid_score'], totals)] + +# 创建每个参与者的 post_score 和总得分的百分比 +post_rel = [i / j * 100 for i,j in zip(df['post_score'], totals)] + +# 在位置 bar_1 创建条形图 +ax.bar(bar_l, + # 使用数据 pre_rel + pre_rel, + # 标签 + label='Pre Score', + # 透明度 + alpha=0.9, + # 颜色 + color='#019600', + # 条形宽度 + width=bar_width, + # 边框颜色 + edgecolor='white' + ) + +# 在位置 bar_1 创建条形图 +ax.bar(bar_l, + # 使用数据 mid_rel + mid_rel, + # 底部为 pre_rel + bottom=pre_rel, + # 标签 + label='Mid Score', + # 透明度 + alpha=0.9, + # 颜色 + color='#3C5F5A', + # 条形宽度 + width=bar_width, + # 边框颜色 + edgecolor='white' + ) + +# Create a bar chart in position bar_1 +ax.bar(bar_l, + # 使用数据 post_rel + post_rel, + # 底部为 pre_rel 和 mid_rel + bottom=[i+j for i,j in zip(pre_rel, mid_rel)], + # 标签 + label='Post Score', + # 透明度 + alpha=0.9, + # 颜色 + color='#219AD8', + # 条形宽度 + width=bar_width, + # 边框颜色 + edgecolor='white' + ) + +# 将刻度设为 first_name +plt.xticks(tick_pos, df['first_name']) +ax.set_ylabel("Percentage") +ax.set_xlabel("") + +# 设置图形边界 +plt.xlim([min(tick_pos)-bar_width, max(tick_pos)+bar_width]) +plt.ylim(-10, 110) + +# 旋转轴标签 +plt.setp(plt.gca().get_xticklabels(), rotation=45, horizontalalignment='right') + +# 展示绘图 +plt.show() +``` + +![png](https://chrisalbon.com/python/data_visualization/matplotlib_percentage_stacked_bar_plot_6_0.png) -- GitLab