Credit to God, my Mother, family and friends.
All errors are my own.
Best,
George John Jordan Thomas Aquinas Hayward, Optimist
Data Scientist
September 30, 2019
import pandas as pd
import numpy as np
import missingno as msno
import matplotlib.pyplot as plt
import seaborn as sns; sns.set()
import warnings
warnings.filterwarnings("ignore")
from fitter import Fitter
from scipy import stats
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn import linear_model
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error, r2_score, median_absolute_error, \
explained_variance_score, confusion_matrix, accuracy_score, precision_score, recall_score
import xgboost as xgb
import statsmodels.formula.api as sm
from statsmodels.api import add_constant
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import FunctionTransformer
import matplotlib as mpl
from mpl_toolkits import mplot3d
from sklearn.cluster import KMeans
from sklearn.preprocessing import MinMaxScaler
from sklearn.decomposition import PCA
import hypertools as hyp
# Load the first dataset from disk.
ds1 = pd.read_csv("ds1.csv")
# Preview the first three rows exactly as loaded.
ds1.head(3)
# Discard the leading column (the "Unnamed: 0" index column) and preview again.
ds1 = ds1.drop(columns=ds1.columns[0])
ds1.head(3)
# Visualize missing values across all columns.
msno.matrix(ds1, color=(0.0, 0.0, 0.2))
# Summary statistics for every column.
ds1.describe()
def distribution_visualizer_and_fitter(df, col_str):
    """Show a histogram, a distribution-fit summary and a probability plot for one column.

    Parameters
    ----------
    df : table that is indexed as ``df[col_str]``.
    col_str : label of the column to analyse.

    Nothing is returned; the function only displays figures and the Fitter summary.
    """
    column_values = df[col_str]

    # Step 1: histogram of the raw values.
    _, hist_ax = plt.subplots(figsize=(12, 4))
    hist_ax.hist(column_values, bins=100)
    hist_ax.set_title('Histogram of {}'.format(col_str), fontweight='bold')
    hist_ax.set_ylabel('Frequency')
    hist_ax.set_xlabel('Values')
    plt.show()

    # Step 2: fit a fixed set of candidate distributions and summarize the fits.
    candidate_distributions = [
        'gamma', 'beta', 'rayleigh', 'norm', 'pareto', 'uniform',
        'logistic', 'expon', 'chi2', 'pearson3',
    ]
    fitter = Fitter(column_values, distributions=candidate_distributions,
                    verbose=False, timeout=10)
    fitter.fit()
    fitter.summary()

    # Step 3: probability plot in the left half of a two-panel figure;
    # the right panel is blanked so the plot keeps a half-width layout.
    _, (prob_ax, blank_ax) = plt.subplots(ncols=2, figsize=(12, 4))
    blank_ax.set_axis_off()
    stats.probplot(column_values, plot=prob_ax)
    plt.show()
# Run the distribution diagnostics on every column of ds1.
for i in ds1.columns:
    distribution_visualizer_and_fitter(ds1, i)

# Correlation heatmap.
# all credit due to Pedro Marcelino; https://www.kaggle.com/pmarcelino/comprehensive-data-exploration-with-python
corrmat = ds1.corr()
f, ax = plt.subplots(figsize=(12, 9))
sns.heatmap(corrmat, vmax=0.8, square=True)
plt.savefig('ds1_correlation_matrix.png', dpi=300, bbox_inches='tight')

# Pair plot of every column against every other column.
# all credit due to Pedro Marcelino; https://www.kaggle.com/pmarcelino/comprehensive-data-exploration-with-python
sns.pairplot(ds1, height=2.5)
plt.savefig('ds1_pair_plot_lower_res.png', dpi=150, bbox_inches='tight')
plt.show()

# Independent variables (x4 is not used) and the three dependent variables.
features = ds1[['x1', 'x2', 'x3', 'x5', 'x6']]
ya, yb, yc = ds1['ya'], ds1['yb'], ds1['yc']
def xgboost_modeler(X, y, cv_folds_int, x_left, x_right, y_bottom, y_top):
    """Fit an XGBoost regressor on an 80/20 split and report metrics, plots and importances.

    Parameters
    ----------
    X : feature table. Iterating it must yield the feature names (as a pandas
        DataFrame does) because they are paired with ``feature_importances_``.
    y : target values aligned with ``X``.
    cv_folds_int : passed as ``cv`` to ``cross_val_score`` on the training split.
    x_left, x_right : x-axis limits of the two scatter plots.
    y_bottom, y_top : y-axis limits of the two scatter plots.

    Side effects
    ------------
    Prints hold-out and cross-validation metrics, shows two figures and saves them
    under the fixed names 'xgboost_modeling_' and
    'xgb_features_importance_dependent_var_', so each call overwrites the files
    written by the previous call. Nothing is returned.
    """
    X_train_xgb, X_test, y_train_xgb, y_test = train_test_split(
        X, y, test_size=0.2, random_state=1)
    xgb_model = xgb.XGBRegressor(objective="reg:squarederror", random_state=42)
    xgb_model.fit(X_train_xgb, y_train_xgb)
    y_pred_xgb = xgb_model.predict(X_test)
    residuals = y_pred_xgb - y_test  # predicted minus actual

    # Hold-out metrics.
    print("R^2:" + str(round(r2_score(y_test, y_pred_xgb), 4)))
    print("Mean Absolute Error (in original units): "
          + str(round(mean_absolute_error(y_test, y_pred_xgb), 2)))
    print("Median Absolute Error (in original units): "
          + str(round(median_absolute_error(y_test, y_pred_xgb), 2)))
    # These two statistics are computed on the hold-out targets, so they are
    # labelled as test data (the labels previously said "Training Data").
    print("STD of Test Data (in original units): " + str(round(np.std(y_test), 2)))
    print("Mean of Test Data (in original units): " + str(round(np.mean(y_test), 2)))
    print("Residual Skew: " + str(round(stats.skew(residuals), 2)))
    print("Residual Kurtosis: " + str(round(stats.kurtosis(residuals), 2)))

    # Diagnostic plots: fit quality, residuals vs actuals, residual distribution.
    fig, axs = plt.subplots(ncols=3, nrows=1, figsize=(15, 4))
    plt.subplots_adjust(top=0.95, bottom=0.01, hspace=0.4, wspace=0.16)
    axs[0].scatter(y_test, y_pred_xgb)
    axs[0].set_title('Actuals (x) vs Predicteds (y)', fontweight='bold')
    axs[0].set_xlim([x_left, x_right])
    axs[0].set_ylim([y_bottom, y_top])
    axs[1].scatter(y_test, residuals)
    axs[1].set_title('Actuals (x) vs Residuals [pred. - actual] (y)', fontweight='bold')
    axs[1].set_xlim([x_left, x_right])
    axs[1].set_ylim([y_bottom, y_top])
    axs[2].hist(residuals, bins=100)
    axs[2].set_title('Histogram of Residuals [predicted - actual]', fontweight='bold')
    plt.savefig('xgboost_modeling_', dpi=300, bbox_inches='tight')
    plt.show()

    # Cross-validation on the training split. Each scoring is evaluated once and
    # reused for every summary statistic instead of re-fitting per printed line.
    print("Cross-Validation Scoring for XGBoost")
    mae_scores = cross_val_score(xgb_model, X_train_xgb, y_train_xgb,
                                 cv=cv_folds_int, scoring='neg_mean_absolute_error')
    r2_scores = cross_val_score(xgb_model, X_train_xgb, y_train_xgb,
                                cv=cv_folds_int, scoring='r2')
    # The scorer returns negated errors, hence the sign flip.
    print('Mean Mean Absolute Error: {}'.format(-1 * round(mae_scores.mean(), 2)))
    print('Median Mean Absolute Error: {}'.format(-1 * round(np.median(mae_scores), 2)))
    print('Mean R^2: {}'.format(round(r2_scores.mean(), 3)))
    print('Median R^2: {}'.format(round(np.median(r2_scores), 3)))
    print('Max R^2: {}'.format(round(np.max(r2_scores), 3)))
    print('Min R^2: {}'.format(round(np.min(r2_scores), 3)))

    # Ten most important features, largest first.
    top_xgb_features = pd.DataFrame(
        sorted(zip(X, xgb_model.feature_importances_),
               key=lambda pair: abs(pair[1]), reverse=True)[:10],
        columns=['Feature', 'XGBoost Importance'])

    bar_count = range(len(top_xgb_features.Feature))
    # Two-panel figure with the right panel blanked; only the left panel is saved.
    fig, axs = plt.subplots(ncols=2, figsize=(14, 4))
    axs[1].set_axis_off()
    # [::-1] reverses the order so the most important feature is drawn at the top.
    axs[0].barh(bar_count, top_xgb_features['XGBoost Importance'][::-1],
                align='center', alpha=1)
    axs[0].set_yticks(bar_count)
    axs[0].set_yticklabels(top_xgb_features.Feature[::-1], fontsize=10)
    axs[0].set_xlabel('XGBoost Importance')
    axs[0].set_title("XGBoost's Feature Importances", fontweight='bold')
    extent = axs[0].get_window_extent().transformed(fig.dpi_scale_trans.inverted())
    fig.savefig('xgb_features_importance_dependent_var_', dpi=300,
                bbox_inches=extent.expanded(1.5, 1.5))
    plt.show()
# ya: XGBoost baseline, then an OLS summary on all features plus an intercept.
xgboost_modeler(X=features, y=ya, cv_folds_int=4,
                x_left=-100, x_right=100, y_bottom=-100, y_top=100)
features_regression = add_constant(features)
regressor_OLS = sm.OLS(ya, features_regression).fit()
regressor_OLS.summary()

# yb: same two steps, then a second OLS using x1 as the only predictor.
xgboost_modeler(X=features, y=yb, cv_folds_int=4,
                x_left=-5, x_right=5, y_bottom=-5, y_top=5)
features_regression = add_constant(features)
regressor_OLS = sm.OLS(yb, features_regression).fit()
regressor_OLS.summary()
features_regression_x1 = add_constant(ds1['x1'])
regressor_OLS = sm.OLS(yb, features_regression_x1).fit()
regressor_OLS.summary()

# yc: XGBoost baseline and OLS on all features.
xgboost_modeler(X=features, y=yc, cv_folds_int=4,
                x_left=-0.1, x_right=0.1, y_bottom=-0.1, y_top=0.1)
features_regression = add_constant(features)
regressor_OLS = sm.OLS(yc, features_regression).fit()
regressor_OLS.summary()
def backwardElimination(X, y, significance_level_float):
    """Prune regressors by backward elimination on OLS p-values.

    Repeatedly fits ``sm.OLS(y, X)`` and drops the column with the largest
    p-value while that p-value exceeds ``significance_level_float``.

    Parameters
    ----------
    X : pandas DataFrame of regressors (any constant column is treated like
        every other column and may be dropped).
    y : dependent variable passed to ``sm.OLS``.
    significance_level_float : p-value threshold above which a column is removed.

    Returns
    -------
    The DataFrame ``X`` restricted to the surviving columns.
    """
    # One column is removed per pass, so len(X.columns) passes are always enough.
    for _ in range(len(X.columns)):
        pvalues = sm.OLS(y, X).fit().pvalues
        if not (pvalues.max() > significance_level_float):
            # Every remaining regressor is significant; further refits would
            # give the same result.
            break
        # Drop by label rather than by position: positions taken from the old
        # fit go stale as soon as a column is removed, which previously dropped
        # the wrong column (or raised IndexError) when p-values tied.
        X = X.drop(columns=pvalues.idxmax())
    return X
# Build every polynomial combination of the features up to degree 3.
poly = PolynomialFeatures(degree=3, include_bias=False)
features_w_poly = poly.fit_transform(features)
# get_feature_names() was removed in scikit-learn 1.2; use its replacement when
# available and fall back to the old method on older installations.
if hasattr(poly, 'get_feature_names_out'):
    poly_feature_names = poly.get_feature_names_out(features.columns)
else:
    poly_feature_names = poly.get_feature_names(features.columns)
# Keep the original row index so the frame stays aligned with ya / yb / yc.
features_w_poly = pd.DataFrame(features_w_poly, columns=poly_feature_names,
                               index=features.index)
# include_bias was False above, so the regression constant is added explicitly here.
features_poly_regression = add_constant(features_w_poly)

# XGBoost polynomial test for ya
xgboost_modeler(X=features_w_poly, y=ya, cv_folds_int=4, x_left=-100, x_right=100, y_bottom=-100, y_top=100)
# Backward-elimination regression for ya (this step takes a while to execute).
Features_Poly_Pruned_ya = backwardElimination(features_poly_regression, ya, significance_level_float=0.00001)
Features_Poly_Pruned_ya = add_constant(Features_Poly_Pruned_ya)
pruned_regressor_OLS = sm.OLS(endog=ya, exog=Features_Poly_Pruned_ya).fit()
pruned_regressor_OLS.summary()

# XGBoost polynomial test for yb
xgboost_modeler(X=features_w_poly, y=yb, cv_folds_int=4, x_left=-6, x_right=6, y_bottom=-6, y_top=6)
# Backward-elimination regression for yb (this step takes a while to execute).
Features_Poly_Pruned_yb = backwardElimination(features_poly_regression, yb, significance_level_float=0.00001)
Features_Poly_Pruned_yb = add_constant(Features_Poly_Pruned_yb)
pruned_regressor_OLS = sm.OLS(endog=yb, exog=Features_Poly_Pruned_yb).fit()
pruned_regressor_OLS.summary()

# XGBoost polynomial test for yc
xgboost_modeler(X=features_w_poly, y=yc, cv_folds_int=4, x_left=-0.1, x_right=0.1, y_bottom=-0.1, y_top=0.1)
# Backward-elimination regression for yc (this step takes a while to execute).
Features_Poly_Pruned_yc = backwardElimination(features_poly_regression, yc, significance_level_float=0.00001)
Features_Poly_Pruned_yc = add_constant(Features_Poly_Pruned_yc)
pruned_regressor_OLS = sm.OLS(endog=yc, exog=Features_Poly_Pruned_yc).fit()
pruned_regressor_OLS.summary()
def multiple_linear_regression_modeler(X, y, cv_folds_int, x_left, x_right, y_bottom, y_top):
    """Fit a multi-feature LinearRegression on an 80/20 split and report its quality.

    Parameters
    ----------
    X : pandas DataFrame of predictors; ``X.columns`` labels the coefficient table.
    y : target values aligned with ``X``.
    cv_folds_int : passed as ``cv`` to ``cross_val_score`` on the training split.
    x_left, x_right : x-axis limits of the two scatter plots.
    y_bottom, y_top : y-axis limits of the two scatter plots.

    Side effects
    ------------
    Prints hold-out metrics, cross-validation metrics, the (up to) ten largest
    coefficients by absolute value and the intercept; shows a three-panel figure
    and saves it under the fixed name 'linear_regression_modeling_', overwriting
    the file from any previous call. Nothing is returned.
    """
    X_train1, X_test, y_train1, y_test = train_test_split(
        X, y, test_size=0.2, random_state=1)
    model = LinearRegression()
    model.fit(X_train1, y_train1)
    y_slr_predict = model.predict(X_test)
    residuals = y_slr_predict - y_test  # predicted minus actual

    # Hold-out metrics.
    print("R^2:" + str(round(r2_score(y_test, y_slr_predict), 4)))
    print("Mean Absolute Error (in original units): "
          + str(round(mean_absolute_error(y_test, y_slr_predict), 2)))
    # These two statistics are computed on the hold-out targets, so they are
    # labelled as test data (the labels previously said "Training Data").
    print("STD of Test Data (in original units): " + str(round(np.std(y_test), 2)))
    print("Mean of Test Data (in original units): " + str(round(np.mean(y_test), 2)))
    print("Residual Skew: " + str(round(stats.skew(residuals), 2)))
    print("Residual Kurtosis: " + str(round(stats.kurtosis(residuals), 2)))

    # Diagnostic plots: fit quality, residuals vs actuals, residual distribution.
    fig, axs = plt.subplots(ncols=3, nrows=1, figsize=(15, 4))
    plt.subplots_adjust(top=0.95, bottom=0.01, hspace=0.25, wspace=0.16)
    axs[0].scatter(y_test, y_slr_predict)
    axs[0].set_title('Actuals (x) vs Predicteds (y)', fontweight='bold')
    axs[0].set_xlim([x_left, x_right])
    axs[0].set_ylim([y_bottom, y_top])
    axs[1].scatter(y_test, residuals)
    axs[1].set_title('Actuals (x) vs Residuals [pred. - actual] (y)', fontweight='bold')
    axs[1].set_xlim([x_left, x_right])
    axs[1].set_ylim([y_bottom, y_top])
    axs[2].hist(residuals, bins=100)
    axs[2].set_title('Histogram of Residuals [predicted - actual]', fontweight='bold')
    plt.savefig('linear_regression_modeling_', dpi=300, bbox_inches='tight')
    plt.show()

    # Cross-validation on the training split. Each scoring is evaluated once and
    # reused for every summary statistic instead of re-fitting per printed line.
    print("Cross-Validation Scoring for Standard Linear Regression")
    mae_scores = cross_val_score(model, X_train1, y_train1,
                                 cv=cv_folds_int, scoring='neg_mean_absolute_error')
    r2_scores = cross_val_score(model, X_train1, y_train1,
                                cv=cv_folds_int, scoring='r2')
    # The scorer returns negated errors, hence the sign flip.
    print('Mean Absolute Error: {}'.format(-1 * round(mae_scores.mean(), 2)))
    print('Median Mean Absolute Error: {}'.format(-1 * round(np.median(mae_scores), 2)))
    print('Mean R^2: {}'.format(round(r2_scores.mean(), 2)))
    print('Median R^2: {}'.format(round(np.median(r2_scores), 3)))
    print('Max R^2: {}'.format(round(np.max(r2_scores), 3)))
    print('Min R^2: {}'.format(round(np.min(r2_scores), 3)))

    # Pair each column with its fitted coefficient, largest magnitude first.
    top_10_linear_regression_features = pd.DataFrame(
        sorted(zip(X.columns, model.coef_),
               key=lambda pair: abs(pair[1]), reverse=True)[:10],
        columns=['Feature', 'Linear Regression Coefficient'])
    print(top_10_linear_regression_features)
    print('Intercept: {}'.format(model.intercept_))
def single_linear_regression_modeler(X, single_feature_str, y, cv_folds_int, x_left, x_right, y_bottom, y_top):
    """Fit a one-feature LinearRegression on an 80/20 split and report its quality.

    Parameters
    ----------
    X : predictor data in the 2-D layout scikit-learn expects; the caller in
        this file passes a single column reshaped with ``reshape(-1, 1)``.
    single_feature_str : label shown for the feature in the coefficient table.
    y : target values aligned with ``X``.
    cv_folds_int : passed as ``cv`` to ``cross_val_score`` on the training split.
    x_left, x_right : x-axis limits of the two scatter plots.
    y_bottom, y_top : y-axis limits of the two scatter plots.

    Side effects
    ------------
    Prints hold-out metrics, cross-validation metrics, the coefficient table and
    the intercept; shows a three-panel figure and saves it under the fixed name
    'linear_regression_modeling_', overwriting the file from any previous call.
    Nothing is returned.
    """
    X_train1, X_test, y_train1, y_test = train_test_split(
        X, y, test_size=0.2, random_state=1)
    model = LinearRegression()
    model.fit(X_train1, y_train1)
    y_slr_predict = model.predict(X_test)
    residuals = y_slr_predict - y_test  # predicted minus actual

    # Hold-out metrics.
    print("R^2:" + str(round(r2_score(y_test, y_slr_predict), 4)))
    print("Mean Absolute Error (in original units): "
          + str(round(mean_absolute_error(y_test, y_slr_predict), 2)))
    # These two statistics are computed on the hold-out targets, so they are
    # labelled as test data (the labels previously said "Training Data").
    print("STD of Test Data (in original units): " + str(round(np.std(y_test), 2)))
    print("Mean of Test Data (in original units): " + str(round(np.mean(y_test), 2)))
    print("Residual Skew: " + str(round(stats.skew(residuals), 2)))
    print("Residual Kurtosis: " + str(round(stats.kurtosis(residuals), 2)))

    # Diagnostic plots: fit quality, residuals vs actuals, residual distribution.
    fig, axs = plt.subplots(ncols=3, nrows=1, figsize=(15, 4))
    plt.subplots_adjust(top=0.95, bottom=0.01, hspace=0.25, wspace=0.16)
    axs[0].scatter(y_test, y_slr_predict)
    axs[0].set_title('Actuals (x) vs Predicteds (y)', fontweight='bold')
    axs[0].set_xlim([x_left, x_right])
    axs[0].set_ylim([y_bottom, y_top])
    axs[1].scatter(y_test, residuals)
    axs[1].set_title('Actuals (x) vs Residuals [pred. - actual] (y)', fontweight='bold')
    axs[1].set_xlim([x_left, x_right])
    axs[1].set_ylim([y_bottom, y_top])
    axs[2].hist(residuals, bins=100)
    axs[2].set_title('Histogram of Residuals [predicted - actual]', fontweight='bold')
    plt.savefig('linear_regression_modeling_', dpi=300, bbox_inches='tight')
    plt.show()

    # Cross-validation on the training split. Each scoring is evaluated once and
    # reused for every summary statistic instead of re-fitting per printed line.
    print("Cross-Validation Scoring for Standard Linear Regression")
    mae_scores = cross_val_score(model, X_train1, y_train1,
                                 cv=cv_folds_int, scoring='neg_mean_absolute_error')
    r2_scores = cross_val_score(model, X_train1, y_train1,
                                cv=cv_folds_int, scoring='r2')
    # The scorer returns negated errors, hence the sign flip.
    print('Mean Absolute Error: {}'.format(-1 * round(mae_scores.mean(), 2)))
    print('Median Mean Absolute Error: {}'.format(-1 * round(np.median(mae_scores), 2)))
    print('Mean R^2: {}'.format(round(r2_scores.mean(), 2)))
    print('Median R^2: {}'.format(round(np.median(r2_scores), 3)))
    print('Max R^2: {}'.format(round(np.max(r2_scores), 3)))
    print('Min R^2: {}'.format(round(np.min(r2_scores), 3)))

    # Pair the single feature label with the fitted coefficient.
    top_10_linear_regression_features = pd.DataFrame(
        sorted(zip([single_feature_str], model.coef_),
               key=lambda pair: abs(pair[1]), reverse=True)[:10],
        columns=['Feature', 'Linear Regression Coefficient'])
    print(top_10_linear_regression_features)
    print('Intercept: {}'.format(model.intercept_))
# Linear model of ya on the three predictors x1, x2 and x3.
multiple_linear_regression_modeler(
    X=ds1.loc[:, ['x1', 'x2', 'x3']],
    y=ya,
    cv_folds_int=4,
    x_left=-100,
    x_right=100,
    y_bottom=-100,
    y_top=100,
)
# ya = 0.251(x1) + 6.236(x2) + 0.956(x3) + 7.58
# Linear model of yb on x1 alone; the column is reshaped to a 2-D (n, 1) array.
single_linear_regression_modeler(
    X=ds1['x1'].values.reshape(-1, 1),
    single_feature_str='x1',
    y=yb,
    cv_folds_int=4,
    x_left=-6,
    x_right=6,
    y_bottom=-6,
    y_top=6,
)
# yb = 0.253(x1) + 0.8422
# Linear model of yc on every other column of ds1. The target column itself is
# excluded from the predictors: leaving 'yc' in X lets the model reproduce the
# target trivially, which makes every reported metric meaningless.
# NOTE(review): ya and yb are still used as predictors here - confirm that is intended.
multiple_linear_regression_modeler(X=ds1.drop(columns=['yc']), y=yc, cv_folds_int=4,
                                   x_left=-0.1, x_right=0.1, y_bottom=-0.1, y_top=0.1)
ds2 = pd.read_csv('ds2.csv')
# Drop the extra index column since a pandas DataFrame has one built in.
ds2 = ds2.drop(ds2.columns[0], axis=1)
# Peek at the data.
ds2.head()
# Check for NULLs.
msno.matrix(ds2, color=(.0, .0, .2))
# Describe the data.
ds2.describe()
# all credit due to: Pedro Marcelino; https://www.kaggle.com/pmarcelino/comprehensive-data-exploration-with-python
# Correlation matrix.
corrmat = ds2.corr()
f, ax = plt.subplots(figsize=(12, 9))
sns.heatmap(corrmat, vmax=.8, square=True)
# Saved under a ds2-specific name; the previous 'ds1_correlation_matrix.png'
# overwrote the heatmap already saved for ds1.
plt.savefig('ds2_correlation_matrix.png', dpi=300, bbox_inches='tight')
# all credit due to: Pedro Marcelino, https://www.kaggle.com/pmarcelino/comprehensive-data-exploration-with-python
# `height` is the current name of the deprecated `size` keyword and matches
# the ds1 pair plot call.
sns.pairplot(ds2, height=2.5)
plt.show()
# Check the per-column distributions again, this time for ds2.
for i in ds2.columns:
    distribution_visualizer_and_fitter(ds2, i)
# Scale the data so each component's contribution to variance is compared
# apples-to-apples.
scaler = MinMaxScaler()
data_rescaled = scaler.fit_transform(ds2)
# A variance threshold is needed. 95% is used here; others may choose 99%.
pca = PCA(n_components=0.95)
pca.fit(data_rescaled)
reduced = pca.transform(data_rescaled)
# all credit here to Bartosz Mikulski, https://www.mikulskibartosz.name/pca-how-to-choose-the-number-of-components/
# Refit with all components kept to plot the full cumulative-variance curve.
pca = PCA().fit(data_rescaled)
plt.rcParams["figure.figsize"] = (12, 6)
fig, ax = plt.subplots()
y = np.cumsum(pca.explained_variance_ratio_)
# Derive the x positions from the number of fitted components instead of
# hard-coding 10, so the plot still works if ds2 has a different column count.
xi = np.arange(1, len(y) + 1, step=1)
plt.ylim(0.0, 1.1)
plt.plot(xi, y, marker='o', linestyle='--', color='b')
plt.xlabel('Number of Components')
plt.xticks(np.arange(0, len(y) + 1, step=1))  # change from 0-based array index to 1-based human-readable label
plt.ylabel('Cumulative variance (%)')
plt.title('The number of components needed to explain variance')
plt.axhline(y=0.95, color='r', linestyle='-')
plt.text(0.5, 0.85, '95% cut-off threshold', color='red', fontsize=16)
ax.grid(axis='x')
plt.show()
# Hypertools, a third-party Python package, offers a similar procedure that
# looks at correlation instead.
# The package was written by Andrew Heusser,
# https://hypertools.readthedocs.io/en/latest/auto_examples/plot_describe.html#sphx-glr-auto-examples-plot-describe-py
hyp.describe(ds2)

# Project the rescaled data onto its first three principal components.
pca_execution = PCA(n_components=3)
pca_doing_its_work = pca_execution.fit_transform(data_rescaled)
print("original shape: ", data_rescaled.shape)
print("transformed shape:", pca_doing_its_work.shape)
pca_conjecture = pd.DataFrame(
    pca_doing_its_work,
    columns=['component_1', 'component_2', 'component_3'],
)
pca_conjecture.head()

# 3D scatter of the three components.
# all credit here to Jake Vanderplas,
# https://jakevdp.github.io/PythonDataScienceHandbook/04.12-three-dimensional-plotting.html
fig = plt.figure()
ax = plt.axes(projection='3d')
ax.scatter3D(
    pca_conjecture['component_1'],
    pca_conjecture['component_2'],
    pca_conjecture['component_3'],
)
plt.savefig('pca_3D_graph.png', dpi=300, bbox_inches='tight')
# all credit to Dmitriy Kavyazin, http://bit.ly/2mYDWv7
# Elbow analysis: inertia for k = 1..9 clusters on the three PCA components.
# KMeans is seeded so the curve and the cluster ids below are reproducible from
# run to run, like the seeded split and model used earlier in this file.
ks = range(1, 10)
inertias = []
for k in ks:
    # Create a KMeans instance with k clusters: model
    model = KMeans(n_clusters=k, random_state=42)
    # Fit model to samples
    model.fit(pca_conjecture)
    # Append the inertia to the list of inertias
    inertias.append(model.inertia_)
plt.plot(ks, inertias, '-o', color='black')
plt.xlabel('number of clusters, k')
plt.ylabel('inertia')
plt.xticks(ks)
plt.show()

# Final clustering with four clusters.
kmeans = KMeans(n_clusters=4, random_state=42)
kmeans.fit(pca_conjecture)
choice_kmeans = kmeans.predict(pca_conjecture)
fig = plt.figure()
ax = plt.axes(projection='3d')
ax.scatter3D(pca_conjecture.component_1, pca_conjecture.component_2, pca_conjecture.component_3,
             c=choice_kmeans, cmap='Dark2')
plt.savefig('kmeans_on_pca.png', dpi=300, bbox_inches='tight')

# Check the shapes so we know the labels line up with the rows of ds2.
print(ds2.shape)
print(choice_kmeans.shape)
ds2['cohort_guess'] = choice_kmeans
ds2.sample(10)
# Results of query 3-a.
query_3_a_results = pd.read_csv('stackoverflow_query_3-a_results.csv')
query_3_a_results

# Results of query 3-b: frequency per view level.
query_3_b_results = pd.read_csv('stackoverflow_query_3-b_results.csv')
query_3_b_results.head(10)

# Scatter of frequency against view level (no regression line), limited to levels 0-40.
ax = sns.regplot(data=query_3_b_results, x="View_Level", y="Frequency", fit_reg=False)
ax.set_title("Histogram of Views Generated by Above SQL Query", fontweight='bold')
ax.set_xlabel("View Level", fontweight='bold')
ax.set_ylabel('Frequency', fontweight='bold')
ax.set_xlim(0, 40)
# Thousands separators on the y-axis tick labels.
ax.yaxis.set_major_formatter(mpl.ticker.StrMethodFormatter('{x:,.0f}'))
plt.show()

# Bar chart of the same data, zoomed to view levels 0-80 and frequencies up to 300,000.
plt.figure(figsize=(16, 5))
plt.bar(query_3_b_results['View_Level'], query_3_b_results['Frequency'])
plt.title("Zoomed-In Snapshot of Stack Overflow Views Distribution", fontweight='bold')
plt.xlabel("View Level", fontweight='bold')
plt.ylabel("Frequency", fontweight='bold')
plt.xlim(0, 80)
plt.ylim(0, 300000)
plt.show()

# Results of query 3-c.
query_3_c_results = pd.read_csv('stackoverflow_query_3-c_results.csv')
query_3_c_results

# Results of the bonus query 3-d.
query_3_d_bonus_results = pd.read_csv('stackoverflow_query_3-d_results.csv')
query_3_d_bonus_results