Engage audience with hierarchical summarized insights
Process
1. organize initial story and insights
2. who is your audience and what are they interested in
3. the high-level insights
4. correlation between the insights
5. how will the insights change the audience's mind or help them make decision
import numpy as np, pandas as pd
from matplotlib import pylab
import matplotlib.pyplot as plt
import matplotlib as mpl
mpl.rcParams.update(mpl.rcParamsDefault) # 恢复matplotlib默认样式
import seaborn as sns
import scipy.stats as stats
import statsmodels as sm
from __future__ import division
# plt.style.use('ggplot') #使用ggplot的样式
x=np.linspace(0,10,50)
np.random.seed(10)
# stateful
plt.plot(x,np.sin(x)+x+np.random.randn(50))
plt.plot(x,np.sin(x)+0.5*x+np.random.randn(50))
plt.plot(x,np.sin(x)+2*x+np.random.rand(50))
plt.title("Three curves")
plt.show()
# stateless
fig, ax0=plt.subplots(nrows=1) # Create a figure and a set of subplots.if multiple plots, return an array to ax.
ax0.plot(x,np.sin(x)+x+np.random.randn(50))
ax0.plot(x,np.sin(x)+0.5*x+np.random.randn(50))
ax0.plot(x,np.sin(x)+2*x+np.random.rand(50))
ax0.set_title("Three curves")
plt.show()
## Plot multiple charts on the same figure: stateless only
fig=plt.figure() # crate a new figure
ax0=fig.add_subplot(211) # figure layout: 2x1. We will plot the first chart on [0,0]
ax0.plot(x,np.sin(x)+x+np.random.randn(50))
ax0.plot(x,np.sin(x)+0.5*x+np.random.randn(50))
ax0.plot(x,np.sin(x)+2*x+np.random.rand(50))
ax1=fig.add_subplot(212)
ax1.plot(x,np.sin(x)+x+np.random.randn(50))
ax1.plot(x,np.sin(x)+0.5*x+np.random.randn(50))
ax1.plot(x,np.sin(x)+2*x+np.random.rand(50))
plt.axhline(y=10,color='purple',linestyle='--')
plt.show()
x=np.linspace(0,10,50)
np.random.seed(10)
fig,ax0=plt.subplots(nrows=1)
ax0.plot(x,np.sin(x)+x+np.random.randn(50))
ax0.plot(x,np.sin(x)+0.5*x+np.random.randn(50))
ax0.plot(x,np.sin(x)+2*x+np.random.rand(50))
ax0.set_title("Three curves",fontsize=20)
# 1. adjust gridline type: dotted-line
ax0.grid(color='gray', alpha=0.5, linestyle='dotted') # alpha to adjust grid transparency
# or hide the grid:
ax0.grid(True)
# 2. set x-axis label and y-axis label
ax0.set_xlabel('X')
ax0.set_ylabel('Randomization')
ax0.xaxis.label.set_size(20) # set xlabel size
ax0.yaxis.label.set_size(20) # set xlabel size
# # 3. adjust x-axis and y-axis data range
ax0.set_xticks(np.arange(min(x),max(x)+1,1)) #步长的ticks
plt.show() # used to suppress
## 用for loop to subplot
x=np.linspace(0,10)
np.random.seed(10)
# 生成数据
y1=np.sin(x)+x+np.random.randn(50)
y2=np.sin(x)+0.5*x+np.random.randn(50)
y3=np.sin(x)+2*x+np.random.randn(50)
df=pd.DataFrame({'serie1':y1,'serie2':y2,'serie3':y3})
fig=plt.figure()
fig.subplots_adjust(hspace=0.4)
i=1
for col in df.columns:
plt.subplot(df.shape[1],1,i)
plt.plot(df.loc[:,col])
plt.title(col,y=0.6,loc='right')
i+=1
fig.show()
# Import data using Pandas. csv file is in a Google Drive
file_id='13WIX0uQaA4ROvsfVjqPUwbmyW07XMc9S'
link='https://drive.google.com/uc?export=download&id={FILE_ID}'
csv_url=link.format(FILE_ID=file_id)
df = pd.read_csv(csv_url)
df.head()
# Data preprocessing: rename columns, create age group
df.columns=['cust_id','first_name','last_name','gender','age','region','job','date_join',
'balance']
df['age_group'] = pd.cut(df['age'], bins=[15, 30, 50, float('Inf')], labels=['15-30', '30-50', 'Above 50'])
# Balance vs region: Barchart
dt_region_mean_bal=df.groupby(['region'])['balance'].mean()
dt_region_mean_bal = dt_region_mean_bal.reset_index() # reset index才能保证出来的是一个dataframe,方便画图 不然index就成了region了
dt_region_mean_bal['balance']=np.rint(dt_region_mean_bal['balance']) #round to the nearest integer
fig,ax0=plt.subplots(nrows=1)
ax0.bar(dt_region_mean_bal['region'],dt_region_mean_bal['balance'])
# adjust: xlabel, ylabel,y-axis scale; adjust background color
# add title
ax0.set_title("Balance by region",fontsize=20)
ax0.set_xlabel('Region')
ax0.set_ylabel('Average balance')
ax0.xaxis.label.set_size(20) # set xlabel size
ax0.yaxis.label.set_size(20) # set xlabel size
ax0.set_ylim(top=70000)
ax0.grid(False)
# what about data label?
def autolabel(rects):
"""
Attach a text label above each bar displaying its height
"""
for rect in rects:
height = rect.get_height()
ax0.text(rect.get_x() + rect.get_width()/2., 1.05*height,
'%d' % int(height),
ha='center', va='bottom')
rect1=ax0.bar(dt_region_mean_bal['region'],dt_region_mean_bal['balance'],color='blue')
autolabel(rect1)
plt.show()
# use OOP + pandas plot
fig,ax1=plt.subplots(nrows=1)
df.groupby(['region','gender'])['balance'].mean().unstack().plot(kind='bar',ax=ax1)
ax1.set_ylim(top=70000)
ax1.set_title("Balance by region and gender")
ax1.set_ylabel('Average Balance')
ax1.grid(False)
plt.show()
# two y-axies: example, plot conversions and conversion rate on the same chart.
ts = pd.Series(np.random.randn(1000), index=pd.date_range('1/1/2000', periods=1000))
ts = ts.cumsum()
df = pd.DataFrame(np.random.randn(1000, 4), index=ts.index, columns=list('ABCD'))
df = df.cumsum()
df.plot(secondary_y=['A', 'B'], mark_right=False)
plt.show()
import seaborn as sns
sns.set_style('whitegrid')
#stateful
sns.boxplot(x='species', y='sepal_length', data=df_iris)
plt.xticks(rotation=-45) # adjust xticks
plt.title('Aris species sepal_length boxplox') # add title
plt.show()
# boxplot UK bank client balance by age group, using seaborn and matplotlib
# stateless(OOP)
fig,ax2=plt.subplots(nrows=1)
sns.boxplot(x='age_group',y='balance',data=df,ax=ax2) # connect sns and matplotlib
ax2.grid(False)
ax2.set_title("Balance boxplot sliced by age_group")
plt.show()
# Balance distribution by difference groups
# stateless(OOP)
fig = plt.figure()
ax3 = fig.add_subplot(1,1,1)
sns.distplot(df.loc[df['age_group']=='15-30','balance'],label='15-30', hist=False,ax=ax3)
sns.distplot(df.loc[df['age_group']=='30-50','balance'],label='30-50', hist=False,ax=ax3)
sns.distplot(df.loc[df['age_group']=='Above 50','balance'],label='Above 50', hist=False,ax=ax3)
ax3.grid(False)
ax3.set_title("Balance distribution by age_group")
ax3.legend()
plt.show()
# barplot
df_titanic=sns.load_dataset('titanic')
g=sns.barplot(x="sex", y="survived", hue="class", ci=None, data=df_titanic)
g.set_ylabel('survival rate')
vals=g.get_yticks()
g.set_yticklabels(['{:3.2f}%'.format(x*100) for x in vals]) # if you want to show percentage for yticklabels
plt.show()
# multiple charts and chart overlap
# stateful
fig=plt.figure(figsize=(8,16)) # X: control width, Y: control length
plt.subplot(2,1,1) # or ax0=fig.add_subplot(2,1,1)
sns.boxplot(x='species',y='sepal_length',data=df)
plt.xticks(rotation=-45) # adjust xticks
plt.title('Aris species sepal_length boxplox') # add title
plt.subplot(2,1,2)
sns.distplot(df['sepal_length'])
plt.show()
# g is a sns object
g=sns.factorplot(x='time',
y='pulse',
data=df,
hue='diet', # Color by diet
col='diet', # Separate by diet
kind='box') # Swarmplot
# Rotate x-axis labels
g.set_xticklabels(rotation=-45)
# seaborn fit a polynomial model
x = np.linspace(1,50, num = 100)
dt_poly=pd.DataFrame({'x':x,'y':0.2+0.3*np.power(x,2)})
sns.regplot(x='x', y='y',data=dt_poly,order=2, ci=None, scatter_kws={"s": 80});
plt.show()