Engage the audience with hierarchically summarized insights
Process
1. organize initial story and insights
2. who is your audience and what are they interested in
3. the high-level insights
4. correlation between the insights
5. how the insights will change the audience's mind or help them make a decision
可视化的基本原则
story focus, supported by visualization
data dimension in a chart: 2D
preattentive attributes: color, highlight
less is more: remove distraction
Tableau
Import Package
import numpy as np, pandas as pdfrom matplotlib import pylabimport matplotlib.pyplot as pltimport matplotlib as mplmpl.rcParams.update(mpl.rcParamsDefault)# 恢复matplotlib默认样式import seaborn as snsimport scipy.stats as statsimport statsmodels as smfrom__future__import division# plt.style.use('ggplot') #使用ggplot的样式
def _plot_three_curves(target, x):
    """Draw three noisy sine-plus-trend curves on *target*.

    *target* is any object exposing ``plot`` -- the ``pyplot`` module for the
    stateful interface, or an ``Axes`` for the stateless one.
    """
    # (slope, noise generator) per curve; the third curve draws its noise
    # from np.random.rand, the first two from np.random.randn.
    curves = ((1, np.random.randn), (0.5, np.random.randn), (2, np.random.rand))
    for slope, noise in curves:
        target.plot(x, np.sin(x) + slope * x + noise(50))


x = np.linspace(0, 10, 50)
np.random.seed(10)

# Stateful: pyplot keeps track of the "current" figure and axes for us.
_plot_three_curves(plt, x)
plt.title("Three curves")
plt.show()

# Stateless (object-oriented): plt.subplots creates a figure and a set of
# subplots; with several subplots it returns an array of Axes instead.
fig, ax0 = plt.subplots(nrows=1)
_plot_three_curves(ax0, x)
ax0.set_title("Three curves")
plt.show()

# Several charts on the same figure: stateless only.
fig = plt.figure()  # create a new figure
ax0 = fig.add_subplot(211)  # 2x1 layout, first (top) chart
_plot_three_curves(ax0, x)
ax1 = fig.add_subplot(212)  # 2x1 layout, second (bottom) chart
_plot_three_curves(ax1, x)
ax1.axhline(y=10, color='purple', linestyle='--')
plt.show()
在更灵活的 stateless(面向对象)写法里,还可以:
调节coordinate objects
adjust background color
adjust gridline
set x-axis label and y-axis label
adjust x-axis and y-axis ticks
x = np.linspace(0, 10, 50)
np.random.seed(10)

fig, ax0 = plt.subplots(nrows=1)
# Three noisy sine-plus-trend curves; the last one uses np.random.rand noise.
for slope, noise in ((1, np.random.randn), (0.5, np.random.randn), (2, np.random.rand)):
    ax0.plot(x, np.sin(x) + slope * x + noise(50))
ax0.set_title("Three curves", fontsize=20)

# 1. Grid lines: dotted and semi-transparent (alpha sets the transparency).
#    To hide the grid instead: ax0.grid(False)
ax0.grid(color='gray', alpha=0.5, linestyle='dotted')

# 2. Axis labels, with their font size.
ax0.set_xlabel('X', fontsize=20)
ax0.set_ylabel('Randomization', fontsize=20)

# 3. X-axis ticks: one tick per unit step across the range of x.
ax0.set_xticks(np.arange(x.min(), x.max() + 1, 1))

plt.show()
用for loop来subplot
# Draw one subplot per DataFrame column with a for loop.
n_points = 50  # keep the x grid and the noise vectors the same length
x = np.linspace(0, 10, n_points)
np.random.seed(10)

# Generate the data: three noisy sine-plus-trend series.
y1 = np.sin(x) + x + np.random.randn(n_points)
y2 = np.sin(x) + 0.5 * x + np.random.randn(n_points)
y3 = np.sin(x) + 2 * x + np.random.randn(n_points)
df = pd.DataFrame({'serie1': y1, 'serie2': y2, 'serie3': y3})

fig = plt.figure()
fig.subplots_adjust(hspace=0.4)  # vertical spacing between the subplots
n_rows = df.shape[1]  # one row per column of df
# Subplot positions are 1-based, hence start=1.
for i, col in enumerate(df.columns, start=1):
    ax = fig.add_subplot(n_rows, 1, i)
    ax.plot(df[col])
    ax.set_title(col, y=0.6, loc='right')
# plt.show() (not fig.show()) blocks until the window is closed when run as
# a script, and matches the other examples in these notes.
plt.show()
拿到一个新的数据 如何visualize
# Import data using pandas; the CSV file is hosted on Google Drive.
file_id = '13WIX0uQaA4ROvsfVjqPUwbmyW07XMc9S'
link = 'https://drive.google.com/uc?export=download&id={FILE_ID}'
csv_url = link.format(FILE_ID=file_id)
df = pd.read_csv(csv_url)
df.head()  # the result is discarded when run as a script; kept for notebook use

# Data preprocessing: rename columns, create the age group.
df.columns = ['cust_id', 'first_name', 'last_name', 'gender', 'age',
              'region', 'job', 'date_join', 'balance']
# NOTE(review): pd.cut bins are right-closed by default, so the first bin is
# (15, 30] and age == 15 gets no label (NaN) -- confirm that is intended.
df['age_group'] = pd.cut(df['age'], bins=[15, 30, 50, float('Inf')],
                         labels=['15-30', '30-50', 'Above 50'])

# Balance vs region: bar chart of the mean balance per region.
dt_region_mean_bal = df.groupby(['region'])['balance'].mean()
# reset_index turns the result into a DataFrame with 'region' as a regular
# column (otherwise 'region' would be the index), which is easier to plot.
dt_region_mean_bal = dt_region_mean_bal.reset_index()
# Round to the nearest integer.
dt_region_mean_bal['balance'] = np.rint(dt_region_mean_bal['balance'])


def autolabel(rects, ax=None):
    """Write each bar's height as a text label just above the bar.

    Args:
        rects: the bar rectangles returned by ``Axes.bar``.
        ax: the Axes to annotate; defaults to the module-level ``ax0``.
    """
    if ax is None:
        ax = ax0
    for rect in rects:
        height = rect.get_height()
        # Centre the label horizontally; place it 5% above the bar top.
        ax.text(rect.get_x() + rect.get_width() / 2., 1.05 * height,
                '%d' % int(height), ha='center', va='bottom')


fig, ax0 = plt.subplots(nrows=1)
# Draw the bars once and keep the rectangles for labelling; calling bar()
# a second time would stack a duplicate set of bars on the same axes.
rect1 = ax0.bar(dt_region_mean_bal['region'], dt_region_mean_bal['balance'],
                color='blue')

# Title, axis labels (with font size), y-axis limit, no grid.
ax0.set_title("Balance by region", fontsize=20)
ax0.set_xlabel('Region')
ax0.set_ylabel('Average balance')
ax0.xaxis.label.set_size(20)  # x label size
ax0.yaxis.label.set_size(20)  # y label size
ax0.set_ylim(top=70000)
ax0.grid(False)

# Data labels on top of the bars.
autolabel(rect1)
plt.show()
# Object-oriented matplotlib + pandas plotting: pandas draws onto our Axes.
# Mean balance per (region, gender); unstack moves gender into the columns,
# so each region gets one bar per gender.
mean_balance = df.groupby(['region', 'gender'])['balance'].mean()
balance_by_gender = mean_balance.unstack()

fig, ax1 = plt.subplots(nrows=1)
balance_by_gender.plot.bar(ax=ax1)
ax1.set_title("Balance by region and gender")
ax1.set_ylabel('Average Balance')
ax1.set_ylim(top=70000)
ax1.grid(False)
plt.show()
双y轴
# Two y-axes, e.g. to plot conversions and conversion rate on the same chart.
ts = pd.Series(np.random.randn(1000), index=pd.date_range('1/1/2000', periods=1000))
ts = ts.cumsum()
# Use a dedicated name for the random frame: rebinding `df` here would
# discard the bank data that the later seaborn examples read from `df`
# (columns 'age_group' and 'balance').
df_ts = pd.DataFrame(np.random.randn(1000, 4), index=ts.index, columns=list('ABCD'))
df_ts = df_ts.cumsum()
# Columns A and B are plotted against the secondary y-axis;
# mark_right=False keeps the "(right)" suffix out of the legend.
df_ts.plot(secondary_y=['A', 'B'], mark_right=False)
plt.show()
Seaborn
用seaborn作图
用matplotlib微调
加theme
语法上和R的ggplot很像
对比stateful vs stateless
import seaborn as sns

sns.set_style('whitegrid')

# Stateful: seaborn draws on the current axes, pyplot then fine-tunes it.
# NOTE(review): df_iris is not defined in this section -- presumably
# sns.load_dataset('iris'); confirm it is loaded before this cell.
sns.boxplot(x='species', y='sepal_length', data=df_iris)
plt.xticks(rotation=-45)  # rotate the x tick labels
plt.title('Iris species sepal_length boxplot')  # add a title
plt.show()
# Box plot of UK bank client balance per age group, seaborn + matplotlib,
# using the stateless (object-oriented) interface.
fig, ax2 = plt.subplots(nrows=1)
# Passing ax= is what ties the seaborn plot to the matplotlib Axes.
sns.boxplot(data=df, x='age_group', y='balance', ax=ax2)
ax2.set_title("Balance boxplot sliced by age_group")
ax2.grid(False)
plt.show()
# Balance distribution for the different age groups, stateless (OOP) interface.
fig = plt.figure()
ax3 = fig.add_subplot(1, 1, 1)
# One density curve per age group. sns.kdeplot replaces the deprecated
# sns.distplot(..., hist=False), which drew only the KDE curve as well.
for group in ('15-30', '30-50', 'Above 50'):
    balances = df.loc[df['age_group'] == group, 'balance']
    sns.kdeplot(balances, label=group, ax=ax3)
ax3.grid(False)
ax3.set_title("Balance distribution by age_group")
ax3.legend()
plt.show()
unstack
sns里面自己内置了很多groupby似的操作 帮着slice了数据 方便哭了
# Bar plot: mean of 'survived' by sex, split by passenger class.
from matplotlib.ticker import PercentFormatter

df_titanic = sns.load_dataset('titanic')
# sns.barplot returns the matplotlib Axes it drew on.
g = sns.barplot(x="sex", y="survived", hue="class", ci=None, data=df_titanic)
g.set_ylabel('survival rate')
# Show the y tick labels as percentages. A formatter keeps the labels tied
# to the tick positions; overwriting the label texts with set_yticklabels
# alone leaves them detached from the ticks (and matplotlib warns about it).
g.yaxis.set_major_formatter(PercentFormatter(xmax=1, decimals=2))
plt.show()
# Multiple charts on one figure, stateful (pyplot) interface.
# NOTE(review): `df` must hold the iris data here (columns 'species' and
# 'sepal_length'); loading it is not shown in this section.
fig = plt.figure(figsize=(8, 16))  # figsize=(width, height) in inches

plt.subplot(2, 1, 1)  # or: ax0 = fig.add_subplot(2, 1, 1)
sns.boxplot(x='species', y='sepal_length', data=df)
plt.xticks(rotation=-45)  # rotate the x tick labels
plt.title('Iris species sepal_length boxplot')  # add a title

plt.subplot(2, 1, 2)
# NOTE: distplot is deprecated since seaborn 0.11; sns.histplot(..., kde=True)
# is the suggested replacement once the minimum seaborn version allows it.
sns.distplot(df['sepal_length'])
plt.show()
factor plot
# sns.catplot is the current name of the removed sns.factorplot; it returns
# a seaborn FacetGrid.
# NOTE(review): `df` needs 'time', 'pulse' and 'diet' columns here --
# presumably seaborn's 'exercise' dataset; its loading is not shown.
g = sns.catplot(
    x='time',
    y='pulse',
    data=df,
    hue='diet',   # color by diet
    col='diet',   # one panel (column) per diet
    kind='box',   # box plots; use kind='swarm' for a swarm plot instead
)
# Rotate the x-axis tick labels on every panel.
g.set_xticklabels(rotation=-45)
# Fine-tuning: make the chart cleaner by removing unwanted grid lines and
# adding a title. Left panel: seaborn's default output; right panel: cleaned up.
fig, (ax0, ax1) = plt.subplots(nrows=1, ncols=2, figsize=(8, 4))

sns.regplot(data=dt_lin, x='x', y='y', ax=ax0)

sns.regplot(data=dt_lin, x='x', y='y', ax=ax1)
ax1.set_title('Clean linear regression')
ax1.grid(False)

plt.show()
# Fit a polynomial model with seaborn: order=2 requests a quadratic fit.
x = np.linspace(1, 50, num=100)
y = 0.2 + 0.3 * np.power(x, 2)  # exact quadratic, no noise
dt_poly = pd.DataFrame({'x': x, 'y': y})

sns.regplot(
    data=dt_poly,
    x='x',
    y='y',
    order=2,
    ci=None,                 # no confidence band
    scatter_kws={"s": 80},   # marker size of the scatter points
)
plt.show()