카테고리 없음

Data Visualization - Seaborn

cyberman 2020. 9. 23. 17:12
0923_Seaborn

Data Visualization with Seaborn

In [1]:
# Draw four different charts with sine

import numpy as np
from matplotlib import pyplot as plt
import seaborn as sb

def sinplot(flip=1):
    """Draw four sine curves on the current matplotlib axes.

    Curve ``k`` (k = 1..4) is ``sin(x + 0.5*k)`` scaled by ``(7 - k) * flip``,
    evaluated at 100 evenly spaced points on [0, 14].

    Parameters
    ----------
    flip : number, default 1
        Multiplier applied to every curve; a negative value mirrors the
        curves about the x axis.
    """
    xs = np.linspace(0, 14, 100)
    for k in (1, 2, 3, 4):
        wave = np.sin(xs + k * .5)
        plt.plot(xs, wave * (7 - k) * flip)

# Apply seaborn's default settings, then draw the demo curves.
sb.set()
sinplot(flip=1)
plt.show()
In [2]:
# Pick one of the built-in styles:
# darkgrid, whitegrid, dark, white, ticks
sb.set_style(style='ticks')
sinplot(flip=1)
plt.show()
In [3]:
# Same 'ticks' style as before, this time with spines removed after plotting.
sb.set_style(style='ticks')
sinplot(flip=1)
sb.despine()
plt.show()
In [4]:
# Customize the design through style parameters.
style_params = sb.axes_style()
print(style_params)  # lists the parameter names that can be overridden
sb.set_style(
    style="darkgrid",
    rc={'axes.axisbelow': False, 'grid.color': 'pink'},
)
sb.set_context('poster')  # select the 'poster' plotting context
sinplot(flip=1)
plt.show()
{'axes.facecolor': 'white', 'axes.edgecolor': '.15', 'axes.grid': False, 'axes.axisbelow': True, 'axes.labelcolor': '.15', 'figure.facecolor': 'white', 'grid.color': '.8', 'grid.linestyle': '-', 'text.color': '.15', 'xtick.color': '.15', 'ytick.color': '.15', 'xtick.direction': 'out', 'ytick.direction': 'out', 'lines.solid_capstyle': 'round', 'patch.edgecolor': 'w', 'image.cmap': 'rocket', 'font.family': ['sans-serif'], 'font.sans-serif': ['Arial', 'DejaVu Sans', 'Liberation Sans', 'Bitstream Vera Sans', 'sans-serif'], 'patch.force_edgecolor': True, 'xtick.bottom': True, 'xtick.top': False, 'ytick.left': True, 'ytick.right': False, 'axes.spines.left': True, 'axes.spines.bottom': True, 'axes.spines.right': True, 'axes.spines.top': True}
In [5]:
# Color Palettes
current_palette = sb.color_palette()  # palette returned when no name is given
sb.palplot(current_palette)

# Preview several named palettes; a count of None leaves n_colors at its default.
for palette_name, color_count in [
    ("GnBu", 10),
    ("BrBG", 10),
    ("Set2", None),
    ("Set3", None),
    ("Paired", None),
]:
    sb.palplot(sb.color_palette(palette_name, color_count))

Possible palette values

Accent, Accent_r, Blues, Blues_r, BrBG, BrBG_r, BuGn, BuGn_r, BuPu, BuPu_r, CMRmap, CMRmap_r, Dark2, Dark2_r, GnBu, GnBu_r, Greens, Greens_r, Greys, Greys_r, OrRd, OrRd_r, Oranges, Oranges_r, PRGn, PRGn_r, Paired, Paired_r, Pastel1, Pastel1_r, Pastel2, Pastel2_r, PiYG, PiYG_r, PuBu, PuBuGn, PuBuGn_r, PuBu_r, PuOr, PuOr_r, PuRd, PuRd_r, Purples, Purples_r, RdBu, RdBu_r, RdGy, RdGy_r, RdPu, RdPu_r, RdYlBu, RdYlBu_r, RdYlGn, RdYlGn_r, Reds, Reds_r, Set1, Set1_r, Set2, Set2_r, Set3, Set3_r, Spectral, Spectral_r, Wistia, Wistia_r, YlGn, YlGnBu, YlGnBu_r, YlGn_r, YlOrBr, YlOrBr_r, YlOrRd, YlOrRd_r, afmhot, afmhot_r, autumn, autumn_r, binary, binary_r, bone, bone_r, brg, brg_r, bwr, bwr_r, cividis, cividis_r, cool, cool_r, coolwarm, coolwarm_r, copper, copper_r, cubehelix, cubehelix_r, flag, flag_r, gist_earth, gist_earth_r, gist_gray, gist_gray_r, gist_heat, gist_heat_r, gist_ncar, gist_ncar_r, gist_rainbow, gist_rainbow_r, gist_stern, gist_stern_r, gist_yarg, gist_yarg_r, gnuplot, gnuplot2, gnuplot2_r, gnuplot_r, gray, gray_r, hot, hot_r, hsv, hsv_r, icefire, icefire_r, inferno, inferno_r, jet, jet_r, magma, magma_r, mako, mako_r, nipy_spectral, nipy_spectral_r, ocean, ocean_r, pink, pink_r, plasma, plasma_r, prism, prism_r, rainbow, rainbow_r, rocket, rocket_r, seismic, seismic_r, spring, spring_r, summer, summer_r, tab10, tab10_r, tab20, tab20_r, tab20b, tab20b_r, tab20c, tab20c_r, terrain, terrain_r, twilight, twilight_r, twilight_shifted, twilight_shifted_r, viridis, viridis_r, vlag, vlag_r, winter, winter_r

In [6]:
# Combine a style, a plotting context and a palette before drawing.
sb.set_style(style="darkgrid")
sb.set_context('notebook')
sb.set_palette(palette="tab20b_r")
sinplot(flip=1)
plt.show()

Charts with Distributed Data

In [10]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sb

# Show the names of every example dataset, then load the 'tips' one.
dataset_names = sb.get_dataset_names()
print(dataset_names)
df = sb.load_dataset('tips')
df
['anagrams', 'anscombe', 'attention', 'brain_networks', 'car_crashes', 'diamonds', 'dots', 'exercise', 'flights', 'fmri', 'gammas', 'geyser', 'iris', 'mpg', 'penguins', 'planets', 'tips', 'titanic']
D:\anaconda3\lib\site-packages\seaborn\utils.py:384: UserWarning: No parser was explicitly specified, so I'm using the best available HTML parser for this system ("lxml"). This usually isn't a problem, but if you run this code on another system, or in a different virtual environment, it may use a different parser and behave differently.

The code that caused this warning is on line 384 of the file D:\anaconda3\lib\site-packages\seaborn\utils.py. To get rid of this warning, pass the additional argument 'features="lxml"' to the BeautifulSoup constructor.

  gh_list = BeautifulSoup(http)
Out[10]:
total_bill tip sex smoker day time size
0 16.99 1.01 Female No Sun Dinner 2
1 10.34 1.66 Male No Sun Dinner 3
2 21.01 3.50 Male No Sun Dinner 3
3 23.68 3.31 Male No Sun Dinner 2
4 24.59 3.61 Female No Sun Dinner 4
... ... ... ... ... ... ... ...
239 29.03 5.92 Male No Sat Dinner 3
240 27.18 2.00 Female Yes Sat Dinner 2
241 22.67 2.00 Male Yes Sat Dinner 2
242 17.82 1.75 Male No Sat Dinner 2
243 18.78 3.00 Female No Thur Dinner 2

244 rows × 7 columns

In [11]:
# Order the rows from the largest tip to the smallest.
# The sorted result is rebound to `df` instead of sorting with inplace=True,
# so the cell does not mutate the frame object displayed by the previous cell;
# `df` still refers to the sorted data for the display below.
df = df.sort_values(by=['tip'], ascending=False)
df
Out[11]:
total_bill tip sex smoker day time size
170 50.81 10.00 Male Yes Sat Dinner 3
212 48.33 9.00 Male No Sat Dinner 4
23 39.42 7.58 Male No Sat Dinner 4
59 48.27 6.73 Male No Sat Dinner 4
141 34.30 6.70 Male No Thur Lunch 6
... ... ... ... ... ... ... ...
0 16.99 1.01 Female No Sun Dinner 2
236 12.60 1.00 Male Yes Sat Dinner 2
111 7.25 1.00 Female No Sat Dinner 1
67 3.07 1.00 Female Yes Sat Dinner 1
92 5.75 1.00 Female Yes Fri Dinner 2

244 rows × 7 columns

In [12]:
# Histogram & Kernel Density Estimates
# NOTE(review): distplot is deprecated from seaborn 0.11 on (histplot/displot
# replace it) - revisit when the environment is upgraded.
df = sb.load_dataset('iris')
sb.set_style(style="white")
sepal_length = df['sepal_length']
sb.distplot(sepal_length)
plt.show()
In [13]:
# Without Histogram: hist=False leaves only the density curve.
# NOTE(review): distplot is deprecated from seaborn 0.11 on - revisit when upgrading.
df = sb.load_dataset('iris')
sepal_length = df['sepal_length']
sb.distplot(sepal_length, hist=False)
plt.show()
In [14]:
# Without KDE: kde=False leaves only the histogram bars.
# NOTE(review): distplot is deprecated from seaborn 0.11 on - revisit when upgrading.
df = sb.load_dataset('iris')
sb.set_style(style="white")
sepal_length = df['sepal_length']
sb.distplot(sepal_length, kde=False)
plt.show()
In [15]:
# Plotting Bivariate Distribution
# Joint plot of petal length against petal width for the iris data.
df = sb.load_dataset('iris')
sb.jointplot(
    data=df,
    x='petal_length',
    y='petal_width',
)
plt.show()
In [16]:
# Hexagonal Binning Plot
# kind='hex' selects the hexagonal-bin variant of the joint plot.
df = sb.load_dataset('iris')
sb.jointplot(
    data=df,
    x='sepal_length',
    y='sepal_width',
    kind='hex',
)
plt.show()
In [17]:
# Kernel Density Estimation
# kind='kde' selects the kernel-density variant of the joint plot.
df = sb.load_dataset('iris')
sb.set_palette(palette="Blues")
sb.jointplot(
    data=df,
    x='petal_length',
    y='petal_width',
    kind='kde',
)
plt.show()
In [18]:
# Pair Plot: Visualizing Pairwise Relationship
# With no diag_kind given, the diagonal panels show KDE charts (the default here).
df = sb.load_dataset('iris')
sb.set_style(style="ticks")
sb.pairplot(
    data=df,
    hue='species',
    kind="scatter",
    palette="autumn_r",
)
plt.show()
In [19]:
# Draw histograms on the diagonal instead of KDE charts (diag_kind="hist").
df = sb.load_dataset('iris')
sb.set_style(style="ticks")
sb.pairplot(
    data=df,
    hue='species',
    kind="scatter",
    diag_kind="hist",
    palette="autumn_r",
)
plt.show()
In [20]:
# Plotting Categorical Data
# Categorical scatter plot: petal length per iris species.
df = sb.load_dataset('iris')
sb.set_palette(palette="Set2")
sb.stripplot(data=df, x="species", y="petal_length")
plt.show()
In [21]:
# Same strip plot with jitter switched off.
df = sb.load_dataset('iris')
sb.stripplot(
    data=df,
    x="species",
    y="petal_length",
    jitter=False,
)
plt.show()
In [22]:
# Plotting Categorical Data
# Swarm plot: a categorical scatter used here to avoid overlapping points.
df = sb.load_dataset('iris')
sb.swarmplot(data=df, x="species", y="petal_length")
plt.show()
In [23]:
# Box Plots
# One box of petal length per iris species.
df = sb.load_dataset('iris')
sb.boxplot(data=df, x="species", y="petal_length")
plt.show()
In [24]:
# Violin Plots
# One violin of petal length per iris species.
df = sb.load_dataset('iris')
sb.violinplot(data=df, x="species", y="petal_length")
plt.show()
In [25]:
# Violin plot of the total bill for each day in the tips data.
df = sb.load_dataset('tips')
sb.violinplot(data=df, x="day", y="total_bill")
plt.show()
In [26]:
# Total bill per day again, now with hue='sex' added.
df = sb.load_dataset('tips')
sb.violinplot(
    data=df,
    x="day",
    y="total_bill",
    hue='sex',
)
plt.show()
In [27]:
# Same hue='sex' violin plot with split=True.
df = sb.load_dataset('tips')
sb.violinplot(
    data=df,
    x="day",
    y="total_bill",
    hue='sex',
    split=True,
)
plt.show()

Roller Coaster 데이터로 Seaborn Chart 그리기

rollercoaster_type으로 분류. excitement, intensity, nausea, max_speed, ride_time, ride_length, total_air_time, drops, highest_drop_height 등의 데이터

In [28]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sb

# Prepare the data: read the CSV and discard the two columns not used below.
df = pd.read_csv('rollercoasters.csv').drop(columns=['park_id', 'custom_design'])
# Strip surrounding whitespace so the type names compare equal to the list below.
df['rollercoaster_type'] = df['rollercoaster_type'].str.strip()
# Keep only the rows whose type is one of these five.
kept_types = [
    'Wooden Roller Coaster',
    'Looping Roller Coaster',
    'Corkscrew Roller Coaster',
    'Vertical Drop Coaster',
    'Mine Train Coaster',
]
df = df[df['rollercoaster_type'].isin(kept_types)]
# Optional check of the row count per type (left disabled):
# df.groupby(df['rollercoaster_type']).count().sort_values(by='excitement', ascending=False)
df.head(5)
Out[28]:
theme rollercoaster_type excitement excitement_rating intensity intensity_rating nausea nausea_rating max_speed avg_speed ride_time ride_length max_pos_gs max_neg_gs max_lateral_gs total_air_time drops highest_drop_height inversions
3 Barony Bridge Wooden Roller Coaster 7.69 High 7.92 Very High 4.75 Medium 47 15 79 2401 3.31 -1.15 1.71 5.73 9 62 -1
6 Haunted Harbour Wooden Roller Coaster 7.76 Very High 7.62 High 4.68 Medium 45 17 63 2077 3.18 -1.28 1.83 3.21 5 62 -1
9 Mystic Mountain Wooden Roller Coaster 6.33 High 7.24 High 4.56 Medium 41 9 74 1378 2.61 -0.32 2.15 0.33 2 49 -1
11 Pacific Pyramids Looping Roller Coaster 5.76 High 5.44 High 2.49 Low 36 13 50 1335 2.57 -0.91 2.47 0.93 6 39 -1
13 Pacific Pyramids Vertical Drop Coaster 3.11 Medium 6.91 High 3.98 Medium 45 13 68 1762 4.20 -1.31 3.32 1.11 4 39 -1
In [29]:
# Histogram & Kernel Density Estimates of the ride_time column.
# NOTE(review): distplot is deprecated from seaborn 0.11 on - revisit when upgrading.
sb.set_palette(palette="tab10")
ride_time = df['ride_time']
sb.distplot(ride_time)
plt.show()
In [30]:
# Plotting Bivariate Distribution: excitement against total air time.
sb.set_palette(palette="OrRd_r")
sb.jointplot(
    data=df,
    x='excitement',
    y='total_air_time',
)
plt.show()
In [31]:
# Hexagonal Binning Plot: nausea against highest drop height.
sb.set_palette(palette="PRGn")
sb.jointplot(
    data=df,
    x='nausea',
    y='highest_drop_height',
    kind='hex',
)
plt.show()
In [32]:
# Kernel Density Estimation: intensity against max speed.
sb.set_palette(palette="Blues")
sb.jointplot(
    data=df,
    x='intensity',
    y='max_speed',
    kind='kde',
)
plt.show()
In [33]:
# Pair Plot: Visualizing pairwise relationship
# Restrict the frame to the hue column plus four numeric measures.
sb.set_style(style="ticks")
pair_columns = [
    'rollercoaster_type',
    'excitement',
    'max_speed',
    'ride_length',
    'highest_drop_height',
]
sb.pairplot(
    data=df[pair_columns],
    hue='rollercoaster_type',
    kind="scatter",
    palette="Paired_r",
)
plt.show()
In [34]:
# Categorical Scatter Plots
sb.set_palette("Set1")
# Full category name -> short label shown on the x axis.
type_labels = {
    'Wooden Roller Coaster': 'Wooden',
    'Looping Roller Coaster': 'Looping',
    'Corkscrew Roller Coaster': 'Corkscrew',
    'Vertical Drop Coaster': 'Vertical Drop',
    'Mine Train Coaster': 'Mine Train',
}
# Pin the category order explicitly. Without `order`, the categories are
# inferred from the data, and the data lists a Vertical Drop row before any
# Corkscrew row, so fixed tick labels would be attached to the wrong groups.
sb.stripplot(x="rollercoaster_type", y="excitement", data=df,
             order=list(type_labels))
plt.xticks(list(range(len(type_labels))), list(type_labels.values()))
plt.show()
In [35]:
# Swarmplot: Avoiding overlapping points
sb.set_palette("Set2")
# Full category name -> short label shown on the x axis.
type_labels = {
    'Wooden Roller Coaster': 'Wooden',
    'Looping Roller Coaster': 'Looping',
    'Corkscrew Roller Coaster': 'Corkscrew',
    'Vertical Drop Coaster': 'Vertical Drop',
    'Mine Train Coaster': 'Mine Train',
}
# Pin the category order explicitly. Without `order`, the categories are
# inferred from the data, and the data lists a Vertical Drop row before any
# Corkscrew row, so fixed tick labels would be attached to the wrong groups.
sb.swarmplot(x="rollercoaster_type", y="intensity", data=df,
             order=list(type_labels))
plt.xticks(list(range(len(type_labels))), list(type_labels.values()))
plt.show()
In [36]:
# Box Plots
sb.set_palette("Paired_r")
# Full category name -> short label shown on the x axis.
type_labels = {
    'Wooden Roller Coaster': 'Wooden',
    'Looping Roller Coaster': 'Looping',
    'Corkscrew Roller Coaster': 'Corkscrew',
    'Vertical Drop Coaster': 'Vertical Drop',
    'Mine Train Coaster': 'Mine Train',
}
# Pin the category order explicitly. Without `order`, the categories are
# inferred from the data, and the data lists a Vertical Drop row before any
# Corkscrew row, so fixed tick labels would be attached to the wrong groups.
sb.boxplot(x="rollercoaster_type", y="nausea", data=df,
           order=list(type_labels))
plt.xticks(list(range(len(type_labels))), list(type_labels.values()))
plt.show()
In [37]:
# Violin Plots
sb.set_palette("Pastel1")
# Full category name -> short label shown on the x axis.
type_labels = {
    'Wooden Roller Coaster': 'Wooden',
    'Looping Roller Coaster': 'Looping',
    'Corkscrew Roller Coaster': 'Corkscrew',
    'Vertical Drop Coaster': 'Vertical Drop',
    'Mine Train Coaster': 'Mine Train',
}
# Pin the category order explicitly. Without `order`, the categories are
# inferred from the data, and the data lists a Vertical Drop row before any
# Corkscrew row, so fixed tick labels would be attached to the wrong groups.
sb.violinplot(x="rollercoaster_type", y="max_speed", data=df,
              order=list(type_labels))
plt.xticks(list(range(len(type_labels))), list(type_labels.values()))
plt.show()