from scipy.stats import norm, levene, ttest_ind, f_oneway, chi2_contingency, pearsonr
import statsmodels.api as sm
from statsmodels.formula.api import ols
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('whitegrid')

# Location of the teaching-ratings CSV used throughout this analysis.
teaching_ratings_url = 'https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBMDeveloperSkillsNetwork-ST0151EN-SkillsNetwork/labs/teachingratings.csv'

# Load the CSV and keep every column from the first one through "prof"
# (a label slice in .loc includes its end label).
teaching_ratings_df = (
    pd.read_csv(teaching_ratings_url)
    .loc[:, :"prof"]
)
# Preview the first five rows.
teaching_ratings_df.head()

# Column names, non-null counts and dtypes.
teaching_ratings_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 463 entries, 0 to 462
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   minority     463 non-null    object 
 1   age          463 non-null    int64  
 2   gender       463 non-null    object 
 3   credits      463 non-null    object 
 4   beauty       463 non-null    float64
 5   eval         463 non-null    float64
 6   division     463 non-null    object 
 7   native       463 non-null    object 
 8   tenure       463 non-null    object 
 9   students     463 non-null    int64  
 10  allstudents  463 non-null    int64  
 11  prof         463 non-null    int64  
dtypes: float64(2), int64(4), object(6)
memory usage: 43.5+ KB

# Summary statistics of the numeric columns. The last column of the summary is
# sliced off; per the info() listing that is `prof`, the instructor identifier.
numeric_summary = teaching_ratings_df.describe()
numeric_summary.iloc[:, :-1]

# Mean, spread and range of the beauty score only.
teaching_ratings_df["beauty"].describe().loc[["mean", "std", "min", "max"]]

mean    6.271140e-08
std     7.886477e-01
min    -1.450494e+00
max     1.970023e+00
Name: beauty, dtype: float64

# Distribution of the beauty score with a kernel density estimate overlaid.
plt.figure()
sns.histplot(x="beauty", kde=True, data=teaching_ratings_df)
plt.show()

# Average beauty score within each gender.
gender_beauty_mean = teaching_ratings_df["beauty"].groupby(teaching_ratings_df["gender"]).mean()
gender_beauty_mean

gender
female    0.116109
male     -0.084482
Name: beauty, dtype: float64

# Beauty-score distribution, one histogram (plus KDE curve) per gender.
plt.figure()
sns.histplot(x="beauty", hue="gender", kde=True, data=teaching_ratings_df)
plt.show()

# Count the rows of tenured instructors per gender. Each row of the data is one
# course evaluation, so an instructor who appears on several rows is counted
# once per row.
gender_tenure_count = teaching_ratings_df[teaching_ratings_df.tenure == 'yes'].groupby('gender')["tenure"].count()
female = gender_tenure_count["female"]
male = gender_tenure_count["male"]
# Legend labels are spelled out in the same order as `sizes`. Deriving them
# from `gender.unique()` follows the row order of the data (order of first
# appearance), which would swap the legend whenever the first row is male.
labels = ["female", "male"]
sizes = [female, male]

fig, ax = plt.subplots()
# Each wedge shows its percentage and, in parentheses, the row count recovered
# from that percentage and the total.
ax.pie(sizes, textprops={'color': "w", 'fontsize': '12'}, autopct=lambda pct: "{:.2f}%\n({:d})".format(pct, round(pct/100 * sum(sizes))))
ax.legend(labels)
plt.title("Tenured professors")
plt.show()

# Rows of tenured instructors, counted per minority status. Each row is one
# course evaluation, so counts are per row rather than per unique instructor.
tenured_rows = teaching_ratings_df[teaching_ratings_df["tenure"] == "yes"]
minority_tenure_count = tenured_rows.groupby("minority")["tenure"].count()
non_minority = minority_tenure_count["no"]
minority = minority_tenure_count["yes"]
sizes = [non_minority, minority]
labels = ["non-minority", "minority"]


def _wedge_caption(pct):
    """Return the wedge text: the percentage, then the row count in parentheses."""
    return "{:.2f}%\n({:d})".format(pct, round(pct/100 * sum(sizes)))


fig, ax = plt.subplots()
ax.pie(sizes, autopct=_wedge_caption, textprops={'color': "w", 'fontsize': '12'})
ax.legend(labels)
plt.title("Tenured professors")
plt.show()

# Average instructor age for each tenure status.
tenure_age_mean = teaching_ratings_df["age"].groupby(teaching_ratings_df["tenure"]).mean()
tenure_age_mean

tenure
no     50.186275
yes    47.850416
Name: age, dtype: float64

# Age distribution split by tenure status, with KDE curves.
plt.figure()
sns.histplot(x="age", hue="tenure", kde=True, data=teaching_ratings_df)
plt.show()

# Average teaching-evaluation score for each tenure status.
tenure_eval_mean = teaching_ratings_df["eval"].groupby(teaching_ratings_df["tenure"]).mean()
tenure_eval_mean

tenure
no     4.133333
yes    3.960111
Name: eval, dtype: float64

# Evaluation-score distribution split by tenure status, with KDE curves.
plt.figure()
sns.histplot(x="eval", hue="tenure", kde=True, data=teaching_ratings_df)
plt.show()

# Average evaluation score for lower- and upper-division courses.
division_eval_mean = teaching_ratings_df["eval"].groupby(teaching_ratings_df["division"]).mean()
division_eval_mean

division
lower    4.087261
upper    3.952614
Name: eval, dtype: float64

# Bar chart of the per-division mean evaluation computed above:
# categories on the x axis, mean score on the y axis.
plt.figure()
sns.barplot(y=division_eval_mean, x=division_eval_mean.index)
plt.show()

# Spread of beauty scores for each value of `credits`.
plt.figure()
sns.boxplot(data=teaching_ratings_df, x='credits', y='beauty')
plt.show()

# Number of rows (course evaluations) per gender.
sns.catplot(data=teaching_ratings_df, x='gender', kind='count')
plt.show()

# Same count, with bars split by tenure status.
sns.catplot(data=teaching_ratings_df, x='gender', hue='tenure', kind='count')
plt.show()

# Same split again, drawn as one row of panels per course division.
sns.catplot(
    data=teaching_ratings_df,
    x='gender',
    hue='tenure',
    row='division',
    kind='count',
    height=3,
    aspect=2,
)
plt.show()

# Average teaching-evaluation score within each gender.
gender_eval_mean = teaching_ratings_df["eval"].groupby(teaching_ratings_df["gender"]).mean()
gender_eval_mean

gender
female    3.901026
male      4.069030
Name: eval, dtype: float64

# Evaluation-score distribution per gender, with KDE curves.
plt.figure()
sns.histplot(x="eval", hue="gender", kde=True, data=teaching_ratings_df)
plt.show()

# Age spread for each gender.
plt.figure()
sns.boxplot(data=teaching_ratings_df, x="gender", y="age")
plt.show()

# Age spread for each tenure status, with one box per gender inside each status.
plt.figure()
sns.boxplot(data=teaching_ratings_df, x="tenure", y="age", hue="gender")
plt.show()

# Average beauty score for native and non-native English speakers.
native_beauty_mean = teaching_ratings_df["beauty"].groupby(teaching_ratings_df["native"]).mean()
native_beauty_mean

native
no     0.031962
yes   -0.002057
Name: beauty, dtype: float64

# Beauty-score distribution split by native-English-speaker status.
plt.figure()
sns.histplot(x='beauty', hue='native', kde=True, data=teaching_ratings_df)
plt.show()

# Age spread for minority and non-minority instructors.
plt.figure()
sns.boxplot(data=teaching_ratings_df, x='minority', y='age')
plt.show()

# Row counts per tenure status, bars split by minority status,
# with one row of panels per gender.
sns.catplot(
    data=teaching_ratings_df,
    x='tenure',
    hue='minority',
    row='gender',
    kind='count',
    height=3,
    aspect=2,
)
plt.show()

# Sample mean, standard deviation and range of the evaluation score; the mean
# and std are reused by the normal-probability calculations that follow.
eval_statistics = teaching_ratings_df["eval"].describe().loc[["mean", "std", "min", "max"]]
eval_statistics

mean    3.998272
std     0.554866
min     2.100000
max     5.000000
Name: eval, dtype: float64

# Distribution of the evaluation score with a KDE curve.
plt.figure()
sns.histplot(data=teaching_ratings_df, x="eval", kde=True)
plt.show()

# Treat eval as normally distributed with the sample mean and standard
# deviation, and evaluate the CDF at 4.5; the upper tail 1 - CDF is the
# probability of a score above 4.5.
eval_mean = eval_statistics["mean"]
eval_std = eval_statistics["std"]
probability = norm.cdf(4.5, loc=eval_mean, scale=eval_std)
print("Probability = {:.2f}%".format(100 * (1 - probability)))

Probability = 18.29%

# Same normal approximation: the probability of a score between 3.5 and 4.2 is
# the difference of the CDF values at the two bounds.
eval_mean = eval_statistics["mean"]
eval_std = eval_statistics["std"]
probability_1 = norm.cdf(3.5, loc=eval_mean, scale=eval_std)
probability_2 = norm.cdf(4.2, loc=eval_mean, scale=eval_std)
print("Probability = {:.2f}%".format(100 * (probability_2 - probability_1)))

Probability = 45.73%

# Per-gender summary of the evaluation score.
teaching_ratings_df["eval"].groupby(teaching_ratings_df["gender"]).describe()

# Levene's test (deviations taken from the group mean) for equal variance of
# eval between female and male instructors; run as a check before the t-test.
is_female = teaching_ratings_df['gender'] == 'female'
is_male = teaching_ratings_df['gender'] == 'male'
levene(
    teaching_ratings_df.loc[is_female, 'eval'],
    teaching_ratings_df.loc[is_male, 'eval'],
    center='mean',
)

LeveneResult(statistic=0.19032922435292574, pvalue=0.6628469836244741)

# Two-sample t-test of eval, female vs. male, with pooled variance
# (equal_var=True).
female_eval = teaching_ratings_df.loc[teaching_ratings_df['gender'] == 'female', 'eval']
male_eval = teaching_ratings_df.loc[teaching_ratings_df['gender'] == 'male', 'eval']
ttest_ind(female_eval, male_eval, equal_var=True)

TtestResult(statistic=-3.249937943510772, pvalue=0.0012387609449522217, df=461.0)

# Bucket instructors into three age bands stored in a new `age_group` column:
# up to 40, strictly between 40 and 60, and 60 or more.
instructor_age = teaching_ratings_df['age']
age_band_masks = {
    '40 years and younger': instructor_age <= 40,
    'between 40 and 60 years': (instructor_age > 40) & (instructor_age < 60),
    '60 years and older': instructor_age >= 60,
}
for band_label, in_band in age_band_masks.items():
    teaching_ratings_df.loc[in_band, 'age_group'] = band_label

# Per-age-group summary of the beauty score.
teaching_ratings_df["beauty"].groupby(teaching_ratings_df["age_group"]).describe()

# Levene's test (mean-centred) for equal variance of beauty across the three
# age groups, passed in youngest-to-oldest order.
age_group_labels = ['40 years and younger', 'between 40 and 60 years', '60 years and older']
beauty_samples = [
    teaching_ratings_df.loc[teaching_ratings_df['age_group'] == label, 'beauty']
    for label in age_group_labels
]
levene(*beauty_samples, center='mean')

LeveneResult(statistic=11.769735544673434, pvalue=1.0350399938234537e-05)

# One-way ANOVA: does the mean beauty score differ between the age groups?
# NOTE(review): the Levene result recorded above (p ~ 1e-05) rejects equal
# variances, which the classic one-way ANOVA assumes -- consider confirming
# the conclusion with a test that does not rely on that assumption.
age_group_labels = ['40 years and younger', 'between 40 and 60 years', '60 years and older']
beauty_samples = [
    teaching_ratings_df.loc[teaching_ratings_df['age_group'] == label, 'beauty']
    for label in age_group_labels
]
f_oneway(*beauty_samples)

F_onewayResult(statistic=23.552552376353074, pvalue=1.8271127151948056e-10)

# Per-age-group summary of the evaluation score.
teaching_ratings_df["eval"].groupby(teaching_ratings_df["age_group"]).describe()

# Levene's test (mean-centred) for equal variance of eval across the three
# age groups, passed in youngest-to-oldest order.
age_group_labels = ['40 years and younger', 'between 40 and 60 years', '60 years and older']
eval_samples = [
    teaching_ratings_df.loc[teaching_ratings_df['age_group'] == label, 'eval']
    for label in age_group_labels
]
levene(*eval_samples, center='mean')

LeveneResult(statistic=3.123930368994838, pvalue=0.04491850441786862)

# One-way ANOVA: does the mean evaluation score differ between the age groups?
# NOTE(review): the Levene result recorded above (p ~ 0.045) is borderline at
# the 5% level, so the equal-variance assumption of this test is worth a look.
age_group_labels = ['40 years and younger', 'between 40 and 60 years', '60 years and older']
eval_samples = [
    teaching_ratings_df.loc[teaching_ratings_df['age_group'] == label, 'eval']
    for label in age_group_labels
]
f_oneway(*eval_samples)

F_onewayResult(statistic=1.6792657352642264, pvalue=0.1876521827204442)

# Contingency table of row counts: tenure status (rows) by gender (columns).
cross_tenure_gender = pd.crosstab(
    index=teaching_ratings_df['tenure'],
    columns=teaching_ratings_df['gender'],
)
cross_tenure_gender

# Chi-square test of independence on that table, without Yates' continuity
# correction.
chi2_contingency(cross_tenure_gender, correction=False)

Chi2ContingencyResult(statistic=2.557051129789522, pvalue=0.10980322511302845, dof=1, expected_freq=array([[ 42.95896328,  59.04103672],
       [152.04103672, 208.95896328]]))

# Scatter plot of eval against beauty with a fitted regression line in red.
sns.lmplot(x="beauty", y="eval", data=teaching_ratings_df, line_kws=dict(color="red"))
plt.show()

# Pearson correlation coefficient between beauty and eval, with its p-value.
beauty_scores = teaching_ratings_df['beauty']
eval_scores = teaching_ratings_df['eval']
pearsonr(beauty_scores, eval_scores)

PearsonRResult(statistic=0.18903909084045206, pvalue=4.247115419813754e-05)

# Per-tenure-status summary of the evaluation score.
teaching_ratings_df["eval"].groupby(teaching_ratings_df["tenure"]).describe()

# Two-sample t-test of eval, tenured vs. non-tenured, with pooled variance
# (equal_var=True).
tenured_eval = teaching_ratings_df.loc[teaching_ratings_df['tenure'] == 'yes', 'eval']
untenured_eval = teaching_ratings_df.loc[teaching_ratings_df['tenure'] == 'no', 'eval']
ttest_ind(tenured_eval, untenured_eval, equal_var=True)

TtestResult(statistic=-2.8046798258451777, pvalue=0.005249471210198792, df=461.0)

# Contingency table of row counts: tenure status (rows) by age group (columns).
cross_tenure_age = pd.crosstab(
    index=teaching_ratings_df['tenure'],
    columns=teaching_ratings_df['age_group'],
)
cross_tenure_age

# Chi-square test of independence. The table is 2x3 (dof = 2 in the recorded
# result); SciPy applies Yates' correction only when dof is 1, so
# `correction=True` does not alter this statistic.
chi2_contingency(cross_tenure_age, correction=True)

Chi2ContingencyResult(statistic=20.957740803528935, pvalue=2.8124473945785386e-05, dof=2, expected_freq=array([[ 24.89416847,  16.96328294,  60.1425486 ],
       [ 88.10583153,  60.03671706, 212.8574514 ]]))

# Contingency table of row counts: minority status (rows) by tenure status
# (columns).
cross_minority_tenure = pd.crosstab(
    index=teaching_ratings_df['minority'],
    columns=teaching_ratings_df['tenure'],
)
cross_minority_tenure

# Chi-square test of independence with Yates' continuity correction (the table
# is 2x2, dof = 1).
# NOTE(review): the tenure-by-gender test earlier passes correction=False;
# confirm the two 2x2 tests are meant to differ.
chi2_contingency(cross_minority_tenure, correction=True)

Chi2ContingencyResult(statistic=1.3675127484429763, pvalue=0.24223968800237178, dof=1, expected_freq=array([[ 87.90064795, 311.09935205],
       [ 14.09935205,  49.90064795]]))

# Regress eval on gender.
# Predictor: gender dummy-coded as female = 1, male = 0, with a constant column
# added so the model has an intercept (beta_0).
X = sm.add_constant(teaching_ratings_df['gender'].map({"female": 1, "male": 0}))

# Response: the teaching-evaluation score.
y = teaching_ratings_df['eval']

# Fit by ordinary least squares.
model = sm.OLS(endog=y, exog=X).fit()

# Coefficient table and fit diagnostics.
model.summary()

# Fit beauty on the categorical age_group via the formula interface, then
# produce the ANOVA table for the fitted model.
model = ols(formula='beauty ~ age_group', data=teaching_ratings_df).fit()
anova_table = sm.stats.anova_lm(model)
anova_table

# Regress beauty on tenure.
# Predictor: tenure dummy-coded as yes = 1, no = 0, with a constant column
# added so the model has an intercept (beta_0).
X = sm.add_constant(teaching_ratings_df['tenure'].map({"yes": 1, "no": 0}))

# Response: the beauty score.
y = teaching_ratings_df['beauty']

# Fit by ordinary least squares.
model = sm.OLS(endog=y, exog=X).fit()

# Coefficient table and fit diagnostics.
model.summary()

# Regress allstudents on native.
# Predictor: native-English-speaker flag dummy-coded as yes = 1, no = 0, with
# a constant column added so the model has an intercept (beta_0).
X = sm.add_constant(teaching_ratings_df["native"].map({"yes": 1, "no": 0}))

# Response: number of students enrolled in the course.
y = teaching_ratings_df['allstudents']

# Fit by ordinary least squares.
model = sm.OLS(endog=y, exog=X).fit()

# Coefficient table and fit diagnostics.
model.summary()

Variable	Description
minority	Does the instructor belong to a minority (non-Caucasian) group?
age	The professor's age
gender	Indicating whether the instructor was male or female.
credits	Is the course a single-credit elective?
beauty	Rating of the instructor's physical appearance by a panel of six students averaged across the six panelists and standardized to have a mean of zero.
eval	Course overall teaching evaluation score, on a scale of 1 (very unsatisfactory) to 5 (excellent).
division	Is the course an upper or lower division course?
native	Is the instructor a native English speaker?
tenure	Is the instructor on a tenure track?
students	Number of students that participated in the evaluation.
allstudents	Number of students enrolled in the course.
prof	Indicating instructor identifier.

	age	beauty	eval	students	allstudents
count	463.000000	4.630000e+02	463.000000	463.000000	463.000000
mean	48.365011	6.271140e-08	3.998272	36.624190	55.177106
std	9.802742	7.886477e-01	0.554866	45.018481	75.072800
min	29.000000	-1.450494e+00	2.100000	5.000000	8.000000
25%	42.000000	-6.562689e-01	3.600000	15.000000	19.000000
50%	48.000000	-6.801430e-02	4.000000	23.000000	29.000000
75%	57.000000	5.456024e-01	4.400000	40.000000	60.000000
max	73.000000	1.970023e+00	5.000000	380.000000	581.000000

	count	mean	std	min	25%	50%	75%	max
age_group
40 years and younger	113.0	0.336196	0.913748	-1.450494	-0.326015	0.289916	1.070944	1.970023
60 years and older	77.0	-0.423557	0.548289	-1.422919	-0.733091	-0.395397	-0.056677	0.588569
between 40 and 60 years	273.0	-0.019693	0.728354	-1.090389	-0.583587	-0.083601	0.420400	1.774334

	count	mean	std	min	25%	50%	75%	max
age_group
40 years and younger	113.0	4.002655	0.505763	2.7	3.6	4.1	4.4	4.8
60 years and older	77.0	3.894805	0.626371	2.2	3.4	4.0	4.4	4.9
between 40 and 60 years	273.0	4.025641	0.551537	2.1	3.7	4.0	4.5	5.0

Dep. Variable:	eval	R-squared:	0.022
Model:	OLS	Adj. R-squared:	0.020
Method:	Least Squares	F-statistic:	10.56
Date:	Thu, 30 May 2024	Prob (F-statistic):	0.00124
Time:	14:04:22	Log-Likelihood:	-378.50
No. Observations:	463	AIC:	761.0
Df Residuals:	461	BIC:	769.3
Df Model:	1
Covariance Type:	nonrobust

Teaching ratings¶

Import libraries¶

Load the data¶

Data Description¶

Get information about each variable¶

Produce a descriptive statistics table¶

Create a histogram of the beauty variable¶

Does average beauty score differ by gender?¶

Calculate the percentage of males and females that are tenured professors¶

Calculate the percentage of minorities and non-minorities that are tenured professors¶

Does average age differ by tenure?¶

What is the mean evaluation score for tenured professors?¶

Do instructors teaching lower-division courses receive higher average teaching evaluations?¶

Create a box plot for beauty scores differentiated by credits¶

What is the number of courses taught by gender?¶

Create a group histogram of courses taught by gender and tenure¶

Create a group histogram of courses taught by gender, differentiated by tenure and division¶

Create a distribution plot of teaching evaluation score with gender as a factor¶

Create a box plot for age differentiated by gender¶

Compare age along with tenure and gender¶

Create a distribution plot of beauty scores with native English speaker as a factor¶

Create a box plot of the age of the instructors by visible minority¶

Create a group histogram of tenure by minority and add the gender factor¶

What is the probability of receiving an evaluation score of greater than 4.5?¶

What is the probability of receiving an evaluation score greater than 3.5 and less than 4.2?¶

Using t-test, does gender affect teaching evaluation rates?¶

Using ANOVA test, does beauty score for instructors differ by age?¶

Using ANOVA test, does teaching evaluation score for instructors differ by age?¶

Using chi-square, is there an association between tenure and gender?¶

Using Pearson correlation, is teaching evaluation score correlated with beauty score?¶

Using t-test, does tenure affect teaching evaluation scores?¶

Using chi-square, is there an association between age and tenure?¶

Using chi-square, is there an association between visible minorities and tenure?¶

Using regression with t-test, does gender affect teaching evaluation score?¶

Using regression with ANOVA, does beauty score for instructors differ by age?¶

Using regression with t-test, does tenure affect beauty scores?¶

Using regression with t-test, does being a native English speaker affect the number of students assigned to professors?¶

	minority	age	gender	credits	beauty	eval	division	native	tenure	students	allstudents	prof
0	yes	36	female	more	0.289916	4.3	upper	yes	yes	24	43	1
1	yes	36	female	more	0.289916	3.7	upper	yes	yes	86	125	1
2	yes	36	female	more	0.289916	3.6	upper	yes	yes	76	125	1
3	yes	36	female	more	0.289916	4.4	upper	yes	yes	77	123	1
4	no	59	male	more	-0.737732	4.5	upper	yes	yes	17	20	2

	count	mean	std	min	25%	50%	75%	max
gender
female	195.0	3.901026	0.538803	2.3	3.6	3.90	4.3	4.9
male	268.0	4.069030	0.556652	2.1	3.7	4.15	4.5	5.0

	count	mean	std	min	25%	50%	75%	max
tenure
no	102.0	4.133333	0.556747	2.8	3.7	4.2	4.6	5.0
yes	361.0	3.960111	0.549104	2.1	3.6	4.0	4.4	5.0

	coef	std err	t	P>\|t\|	[0.025	0.975]
const	4.0690	0.034	121.288	0.000	4.003	4.135
gender	-0.1680	0.052	-3.250	0.001	-0.270	-0.066

Omnibus:	17.625	Durbin-Watson:	1.209
Prob(Omnibus):	0.000	Jarque-Bera (JB):	18.970
Skew:	-0.496	Prob(JB):	7.60e-05
Kurtosis:	2.981	Cond. No.	2.47

	df	sum_sq	mean_sq	F	PR(>F)
age_group	2.0	26.691809	13.345905	23.552552	1.827113e-10
Residual	460.0	260.656087	0.566644	NaN	NaN

Omnibus:	23.184	Durbin-Watson:	0.461
Prob(Omnibus):	0.000	Jarque-Bera (JB):	23.229
Skew:	0.507	Prob(JB):	9.03e-06
Kurtosis:	2.583	Cond. No.	4.05

Omnibus:	429.792	Durbin-Watson:	0.708
Prob(Omnibus):	0.000	Jarque-Bera (JB):	10527.126
Skew:	4.129	Prob(JB):	0.00
Kurtosis:	24.852	Cond. No.	8.01

	coef	std err	t	P>\|t\|	[0.025	0.975]
const	0.0284	0.078	0.363	0.717	-0.125	0.182
tenure	-0.0364	0.089	-0.411	0.681	-0.210	0.138

	coef	std err	t	P>\|t\|	[0.025	0.975]
const	29.6071	14.150	2.092	0.037	1.802	57.413
native	27.2158	14.598	1.864	0.063	-1.471	55.902