Engage audience with hierarchical summarized insights
Process
1. organize initial story and insights
2. who is your audience and what are they interested in
3. the high-level insights
4. correlation between the insights
5. how will the insights change the audience's mind or help them make a decision
# Notebook-wide imports and matplotlib setup.
# A __future__ import must be the first statement, so it is placed before
# every other import.
from __future__ import division

import numpy as np
import pandas as pd
from matplotlib import pylab
import matplotlib.pyplot as plt
import matplotlib as mpl

# Restore matplotlib's default style.
# NOTE(review): this reset is deliberately kept before the seaborn import, as
# in the original statement order.
mpl.rcParams.update(mpl.rcParamsDefault)

import seaborn as sns
import scipy.stats as stats
import statsmodels as sm

# plt.style.use('ggplot')  # use the ggplot style
# Demo: draw the same three noisy curves with the stateful (pyplot) interface,
# the stateless (object-oriented) interface, and on a two-row figure.
x = np.linspace(0, 10, 50)
np.random.seed(10)

# --- Stateful: every call acts on pyplot's implicit "current" axes. ---
plt.plot(x, np.sin(x) + x + np.random.randn(50))
plt.plot(x, np.sin(x) + 0.5 * x + np.random.randn(50))
# The third curve uses uniform noise (rand), unlike the first two (randn).
plt.plot(x, np.sin(x) + 2 * x + np.random.rand(50))
plt.title("Three curves")
plt.show()

# --- Stateless: create a figure and a set of subplots, then call methods on
# the returned Axes. With several subplots, plt.subplots returns an array of
# Axes instead of a single one. ---
fig, ax0 = plt.subplots(nrows=1)
ax0.plot(x, np.sin(x) + x + np.random.randn(50))
ax0.plot(x, np.sin(x) + 0.5 * x + np.random.randn(50))
ax0.plot(x, np.sin(x) + 2 * x + np.random.rand(50))
ax0.set_title("Three curves")
plt.show()

# --- Plot multiple charts on the same figure: stateless only. ---
fig = plt.figure()  # create a new figure
# Figure layout 2x1; 211 selects the first (top) slot.
ax0 = fig.add_subplot(211)
ax0.plot(x, np.sin(x) + x + np.random.randn(50))
ax0.plot(x, np.sin(x) + 0.5 * x + np.random.randn(50))
ax0.plot(x, np.sin(x) + 2 * x + np.random.rand(50))

# 212 selects the second (bottom) slot.
ax1 = fig.add_subplot(212)
ax1.plot(x, np.sin(x) + x + np.random.randn(50))
ax1.plot(x, np.sin(x) + 0.5 * x + np.random.randn(50))
ax1.plot(x, np.sin(x) + 2 * x + np.random.rand(50))
# Draw the reference line through the Axes object rather than through pyplot's
# current-axes state, so this section stays purely object-oriented.
ax1.axhline(y=10, color='purple', linestyle='--')
plt.show()
更灵活的做法stateless里,还可以
调节coordinate objects
adjust background color
adjust gridline
set x-axis label and y-axis label
adjust x-axis and y-axis ticks
# Demo: fine-tune a chart through the object-oriented interface
# (title size, gridlines, axis labels, tick positions).
x = np.linspace(0, 10, 50)
np.random.seed(10)

fig, ax0 = plt.subplots(nrows=1)
ax0.plot(x, np.sin(x) + x + np.random.randn(50))
ax0.plot(x, np.sin(x) + 0.5 * x + np.random.randn(50))
ax0.plot(x, np.sin(x) + 2 * x + np.random.rand(50))
ax0.set_title("Three curves", fontsize=20)

# 1. Adjust the gridline type: dotted line; alpha controls grid transparency.
#    To hide the grid instead, use ax0.grid(False).
ax0.grid(color='gray', alpha=0.5, linestyle='dotted')

# 2. Set the x-axis and y-axis labels.
ax0.set_xlabel('X')
ax0.set_ylabel('Randomization')
ax0.xaxis.label.set_size(20)  # set xlabel size
ax0.yaxis.label.set_size(20)  # set ylabel size

# 3. Adjust the x-axis ticks: one tick per unit step across the range of x.
ax0.set_xticks(np.arange(x.min(), x.max() + 1, 1))
plt.show()
用for loop来subplot
# Demo: use a for loop to draw one subplot per DataFrame column.
n_points = 50  # shared by x and the noise vectors so their lengths always match
x = np.linspace(0, 10, n_points)
np.random.seed(10)

# Generate the data.
y1 = np.sin(x) + x + np.random.randn(n_points)
y2 = np.sin(x) + 0.5 * x + np.random.randn(n_points)
y3 = np.sin(x) + 2 * x + np.random.randn(n_points)
df = pd.DataFrame({'serie1': y1, 'serie2': y2, 'serie3': y3})

fig = plt.figure()
fig.subplots_adjust(hspace=0.4)  # vertical spacing between the subplots

n_rows = df.shape[1]  # one row per column of df
# Subplot positions are 1-based, hence start=1.
for i, col in enumerate(df.columns, start=1):
    plt.subplot(n_rows, 1, i)
    plt.plot(df.loc[:, col])
    plt.title(col, y=0.6, loc='right')

# plt.show() is used here like in the other cells.
plt.show()
拿到一个新的数据 如何visualize
# Import data using pandas. The CSV file is hosted on Google Drive.
file_id = '13WIX0uQaA4ROvsfVjqPUwbmyW07XMc9S'
link = 'https://drive.google.com/uc?export=download&id={FILE_ID}'
csv_url = link.format(FILE_ID=file_id)
df = pd.read_csv(csv_url)
df.head()

# Data preprocessing: rename columns, create the age group.
df.columns = ['cust_id', 'first_name', 'last_name', 'gender', 'age',
              'region', 'job', 'date_join', 'balance']
# NOTE(review): pd.cut bins are right-inclusive by default, i.e. (15, 30],
# (30, 50], (50, inf) -- an age of exactly 15 gets no group. Confirm intent.
df['age_group'] = pd.cut(df['age'],
                         bins=[15, 30, 50, float('Inf')],
                         labels=['15-30', '30-50', 'Above 50'])

# Balance vs region: bar chart.
dt_region_mean_bal = df.groupby(['region'])['balance'].mean()
# reset_index() turns the grouped result back into a DataFrame with 'region'
# as a regular column (otherwise region would be the index), which is easier
# to plot.
dt_region_mean_bal = dt_region_mean_bal.reset_index()
# Round to the nearest integer.
dt_region_mean_bal['balance'] = np.rint(dt_region_mean_bal['balance'])

fig, ax0 = plt.subplots(nrows=1)
# Draw the bars once and keep the returned container for labelling.
rect1 = ax0.bar(dt_region_mean_bal['region'],
                dt_region_mean_bal['balance'],
                color='blue')

# Adjust: title, xlabel, ylabel, y-axis scale, background grid.
ax0.set_title("Balance by region", fontsize=20)
ax0.set_xlabel('Region')
ax0.set_ylabel('Average balance')
ax0.xaxis.label.set_size(20)  # set xlabel size
ax0.yaxis.label.set_size(20)  # set ylabel size
ax0.set_ylim(top=70000)
ax0.grid(False)


# What about data labels?
def autolabel(rects, ax=None):
    """Attach a text label above each bar displaying its height.

    Args:
        rects: iterable of bar patches, e.g. the container returned by
            ``Axes.bar``.
        ax: Axes to write the labels on. Defaults to the module-level
            ``ax0``, which is what the original one-argument form used.
    """
    target = ax0 if ax is None else ax
    for rect in rects:
        height = rect.get_height()
        # Centre the label horizontally on the bar, 5% above its top.
        target.text(rect.get_x() + rect.get_width() / 2., 1.05 * height,
                    '%d' % int(height), ha='center', va='bottom')


autolabel(rect1)
plt.show()
# Object-oriented matplotlib combined with pandas plotting: average balance
# per region, with one bar per gender.
fig, ax1 = plt.subplots(nrows=1)

mean_balance = df.groupby(['region', 'gender'])['balance'].mean()
# unstack() moves 'gender' from the row index to the columns, so pandas draws
# one bar per gender within each region.
mean_balance.unstack().plot(kind='bar', ax=ax1)

ax1.set_title("Balance by region and gender")
ax1.set_ylabel('Average Balance')
ax1.set_ylim(top=70000)
ax1.grid(False)
plt.show()
双y轴
# Two y-axes. Example use case: plot conversions and conversion rate on the
# same chart.
noise = np.random.randn(1000)
dates = pd.date_range('1/1/2000', periods=1000)
ts = pd.Series(noise, index=dates).cumsum()

# Four cumulative random walks sharing the date index of ts.
walks = pd.DataFrame(np.random.randn(1000, 4),
                     index=ts.index,
                     columns=list('ABCD'))
df = walks.cumsum()

# Columns A and B are drawn against the secondary (right-hand) y-axis.
df.plot(secondary_y=['A', 'B'], mark_right=False)
plt.show()
Seaborn
用seaborn作图
用matplotlib微调
加theme
语法上和R的ggplot很像
对比stateful vs stateless
import seaborn as sns

sns.set_style('whitegrid')

# Stateful: seaborn draws on the current axes, pyplot calls then tweak it.
# NOTE(review): df_iris is not defined in the visible part of this file --
# presumably the iris dataset loaded in another cell; confirm.
sns.boxplot(x='species', y='sepal_length', data=df_iris)
plt.xticks(rotation=-45)  # adjust xticks
plt.title('Iris species sepal_length boxplot')  # add title
plt.show()
# Box plot of UK bank client balance by age group, using seaborn together
# with matplotlib's stateless (object-oriented) interface.
fig, ax2 = plt.subplots(nrows=1)

# Passing ax= is what connects seaborn to the matplotlib Axes.
sns.boxplot(data=df, x='age_group', y='balance', ax=ax2)

ax2.set_title("Balance boxplot sliced by age_group")
ax2.grid(False)
plt.show()
# Balance distribution by age group, stateless (object-oriented) style.
fig = plt.figure()
ax3 = fig.add_subplot(1, 1, 1)

# One density curve per age group. sns.kdeplot replaces the deprecated
# sns.distplot(..., hist=False), which drew the same kind of KDE curve.
for group in ('15-30', '30-50', 'Above 50'):
    balances = df.loc[df['age_group'] == group, 'balance']
    # dropna(): distplot discarded missing values before fitting.
    sns.kdeplot(balances.dropna(), label=group, ax=ax3)

ax3.grid(False)
ax3.set_title("Balance distribution by age_group")
ax3.legend()
plt.show()
unstack
sns里面自己内置了很多groupby似的操作 帮着slice了数据 方便哭了
# Bar plot: mean of 'survived' by sex and class on the titanic dataset.
from matplotlib.ticker import FuncFormatter

df_titanic = sns.load_dataset('titanic')
g = sns.barplot(x="sex", y="survived", hue="class", ci=None, data=df_titanic)
g.set_ylabel('survival rate')

# Show the y tick labels as percentages. A formatter is used instead of
# set_yticklabels(...) on the auto-located ticks: fixed label strings stay
# put while tick positions can still move, which can mislabel the axis.
g.yaxis.set_major_formatter(
    FuncFormatter(lambda value, _pos: '{:3.2f}%'.format(value * 100))
)
plt.show()
# Multiple charts on one figure, stateful style.
# figsize=(width, height) in inches.
fig = plt.figure(figsize=(8, 16))

# NOTE(review): df must hold the 'species' / 'sepal_length' columns here; the
# cell that assigns it is not visible in this part of the file -- confirm.
plt.subplot(2, 1, 1)  # or ax0 = fig.add_subplot(2, 1, 1)
sns.boxplot(x='species', y='sepal_length', data=df)
plt.xticks(rotation=-45)  # adjust xticks
plt.title('Iris species sepal_length boxplot')  # add title

plt.subplot(2, 1, 2)
# NOTE(review): sns.distplot is deprecated in recent seaborn releases; kept
# because its replacement (histplot) is not available in older versions.
sns.distplot(df['sepal_length'])
plt.show()
factor plot
id
diet
pulse
time
kind
0
1
low fat
85
1 min
rest
1
1
low fat
85
15 min
rest
2
1
low fat
88
30 min
rest
3
2
low fat
90
1 min
rest
4
2
low fat
92
15 min
rest
# g is a seaborn grid object (one facet per diet), not a single Axes.
# sns.catplot is the current name of the former sns.factorplot; the plot kind
# is passed explicitly, so the result is unchanged.
g = sns.catplot(
    x='time',
    y='pulse',
    data=df,
    hue='diet',  # color by diet
    col='diet',  # separate facets by diet
    kind='box',  # box plot
)

# Rotate x-axis labels.
g.set_xticklabels(rotation=-45)
plt.show()
# Fine-tuning: make the chart cleaner by removing the unwanted gridlines and
# adding a title. Left panel = seaborn output as is, right panel = cleaned up.
# NOTE(review): dt_lin is not defined in the visible part of this file --
# presumably a DataFrame with 'x' and 'y' columns; confirm.
fig = plt.figure(figsize=(8, 4))

# Left panel: untouched regression plot.
ax0 = fig.add_subplot(1, 2, 1)
sns.regplot(data=dt_lin, x='x', y='y', ax=ax0)

# Right panel: same plot, without the grid and with a title.
ax1 = fig.add_subplot(1, 2, 2)
sns.regplot(data=dt_lin, x='x', y='y', ax=ax1)
ax1.set_title('Clean linear regression')
ax1.grid(False)

plt.show()
# Fit a polynomial model with seaborn: y is an exact quadratic in x, and
# regplot fits a second-order polynomial to it.
x = np.linspace(1, 50, num=100)
y_quadratic = 0.2 + 0.3 * np.power(x, 2)
dt_poly = pd.DataFrame({'x': x, 'y': y_quadratic})

sns.regplot(
    data=dt_poly,
    x='x',
    y='y',
    order=2,                 # polynomial degree
    ci=None,                 # no confidence band
    scatter_kws={"s": 80},   # marker size
)
plt.show()