# https://www.kaggle.com/yasserh/housing-prices-dataset
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
#Modelling
import statsmodels.formula.api as smf
# df_mat = pd.read_csv('/kaggle/input/student-performance/student-mat.csv', delimiter=';')
data = pd.read_csv('C:/Users/Stevie/Data_Analysis/Regressions/Housing.csv')
No missing values (na)
data.isna().sum()
price 0 area 0 bedrooms 0 bathrooms 0 stories 0 mainroad 0 guestroom 0 basement 0 hotwaterheating 0 airconditioning 0 parking 0 prefarea 0 furnishingstatus 0 dtype: int64
data.head()
price | area | bedrooms | bathrooms | stories | mainroad | guestroom | basement | hotwaterheating | airconditioning | parking | prefarea | furnishingstatus | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 13300000 | 7420 | 4 | 2 | 3 | yes | no | no | no | yes | 2 | yes | furnished |
1 | 12250000 | 8960 | 4 | 4 | 4 | yes | no | no | no | yes | 3 | no | furnished |
2 | 12250000 | 9960 | 3 | 2 | 2 | yes | no | yes | no | no | 2 | yes | semi-furnished |
3 | 12215000 | 7500 | 4 | 2 | 2 | yes | no | yes | no | yes | 3 | yes | furnished |
4 | 11410000 | 7420 | 4 | 1 | 2 | yes | yes | yes | no | yes | 2 | no | furnished |
Distribution of hoses with and without basic amenities (hotwater/airconditioning/basement)
f = plt.figure(figsize=(14,6))
ax = f.add_subplot(121)
sns.distplot(data['price'],bins=50)
plt.title("Distribution of housing prices")
ax = f.add_subplot(122)
sns.distplot(np.log10(data['price']),bins=50,color='red')
plt.title("Distribution of $log$ housing prices")
C:\Users\Stevie\anaconda3\lib\site-packages\seaborn\distributions.py:2619: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms). warnings.warn(msg, FutureWarning) C:\Users\Stevie\anaconda3\lib\site-packages\seaborn\distributions.py:2619: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms). warnings.warn(msg, FutureWarning)
Text(0.5, 1.0, 'Distribution of $log$ housing prices')
Distribution of the variable seesm to be normally distribution. I will be using this as the dependent variable in my regression analysis
Distribution of housing prices given their number of parking slots
plt.figure(figsize=(14,8))
plt.hist((data[data['parking']==1].price,data[data['parking']==2].price,data[data['parking']==3].price),
color=['#3f7ea6',"#023059",'#1b6f1b'])
plt.legend(['1 Parking slot','2 Parking slots','3 Parking slots'])
plt.xlabel("Housing Price")
plt.ylabel('Number of obs')
plt.title('Distribution of housing prices given the number of parking slots')
Text(0.5, 1.0, 'Distribution of housing prices given the number of parking slots')
Categorical variables
columns = ['mainroad','guestroom','basement','airconditioning','hotwaterheating','prefarea','furnishingstatus']
for i in columns:
plt.figure(figsize=(3,1))
data.groupby(i)[i].count().plot(kind="barh")
plt.title(i)
data.head()
price | area | bedrooms | bathrooms | stories | mainroad | guestroom | basement | hotwaterheating | airconditioning | parking | prefarea | furnishingstatus | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 13300000 | 7420 | 4 | 2 | 3 | yes | no | no | no | yes | 2 | yes | furnished |
1 | 12250000 | 8960 | 4 | 4 | 4 | yes | no | no | no | yes | 3 | no | furnished |
2 | 12250000 | 9960 | 3 | 2 | 2 | yes | no | yes | no | no | 2 | yes | semi-furnished |
3 | 12215000 | 7500 | 4 | 2 | 2 | yes | no | yes | no | yes | 3 | yes | furnished |
4 | 11410000 | 7420 | 4 | 1 | 2 | yes | yes | yes | no | yes | 2 | no | furnished |
#no = 0
# yes = 1
c = ['mainroad','guestroom','basement','hotwaterheating','airconditioning','prefarea']
for i in c:
data.loc[data[i]=='yes',i] = 1
data.loc[data[i]=='no',i] = 0
Dummy variables
df_encode = pd.get_dummies(data,
prefix= '',
prefix_sep='',
columns= ['furnishingstatus'],
drop_first=True, #remove if you want 0 to be a true value
dtype='int8')
Correlation Heatmap
plt.figure(figsize=(14,8))
sns.heatmap(df_encode.corr(),annot=True)
plt.title('Correlation Heatmap')
Text(0.5, 1.0, 'Correlation Heatmap')
df_encode = df_encode.rename(columns={"semi-furnished":'semifurnished'}) #renaming
df_encode
price | area | bedrooms | bathrooms | stories | mainroad | guestroom | basement | hotwaterheating | airconditioning | parking | prefarea | semifurnished | unfurnished | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 13300000 | 7420 | 4 | 2 | 3 | 1 | 0 | 0 | 0 | 1 | 2 | 1 | 0 | 0 |
1 | 12250000 | 8960 | 4 | 4 | 4 | 1 | 0 | 0 | 0 | 1 | 3 | 0 | 0 | 0 |
2 | 12250000 | 9960 | 3 | 2 | 2 | 1 | 0 | 1 | 0 | 0 | 2 | 1 | 1 | 0 |
3 | 12215000 | 7500 | 4 | 2 | 2 | 1 | 0 | 1 | 0 | 1 | 3 | 1 | 0 | 0 |
4 | 11410000 | 7420 | 4 | 1 | 2 | 1 | 1 | 1 | 0 | 1 | 2 | 0 | 0 | 0 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
540 | 1820000 | 3000 | 2 | 1 | 1 | 1 | 0 | 1 | 0 | 0 | 2 | 0 | 0 | 1 |
541 | 1767150 | 2400 | 3 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 |
542 | 1750000 | 3620 | 2 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
543 | 1750000 | 2910 | 3 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
544 | 1750000 | 3850 | 3 | 1 | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
545 rows × 14 columns
sns.distplot(data['area'])
C:\Users\Stevie\anaconda3\lib\site-packages\seaborn\distributions.py:2619: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms). warnings.warn(msg, FutureWarning)
<AxesSubplot:xlabel='area', ylabel='Density'>
import statsmodels.formula.api as smf
df_encode['price'] = np.log(df_encode['price'])
mod = smf.ols(formula='price~unfurnished+stories+parking+bedrooms+bathrooms+mainroad',data=df_encode)
res = mod.fit()
print(res.summary())
OLS Regression Results ============================================================================== Dep. Variable: price R-squared: 0.499 Model: OLS Adj. R-squared: 0.493 Method: Least Squares F-statistic: 89.14 Date: Thu, 20 Jan 2022 Prob (F-statistic): 2.14e-77 Time: 15:58:28 Log-Likelihood: -46.057 No. Observations: 545 AIC: 106.1 Df Residuals: 538 BIC: 136.2 Df Model: 6 Covariance Type: nonrobust ================================================================================= coef std err t P>|t| [0.025 0.975] --------------------------------------------------------------------------------- Intercept 14.4642 0.059 243.658 0.000 14.348 14.581 mainroad[T.1] 0.2370 0.034 6.999 0.000 0.170 0.304 unfurnished -0.1591 0.025 -6.393 0.000 -0.208 -0.110 stories 0.0930 0.015 6.274 0.000 0.064 0.122 parking 0.0924 0.014 6.669 0.000 0.065 0.120 bedrooms 0.0594 0.018 3.347 0.001 0.025 0.094 bathrooms 0.2200 0.025 8.721 0.000 0.170 0.270 ============================================================================== Omnibus: 2.796 Durbin-Watson: 0.978 Prob(Omnibus): 0.247 Jarque-Bera (JB): 2.611 Skew: -0.130 Prob(JB): 0.271 Kurtosis: 3.218 Cond. No. 23.2 ============================================================================== Notes: [1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
I have excluded area because i believe it is highly correlated with all other amenities. Adding it to the regression model might cause multicollinearities
Results suggests that all my variables except for the intercept are significant at the 95% level. R^2 suggests that the model explains 50% of the data variation.
sns.scatterplot(x=res.fittedvalues,y=res.resid)
plt.title('Check for Heteroskedasticity:\n Fitted vs resid')
plt.xlabel('fitted')
plt.ylabel('resid')
Text(0, 0.5, 'resid')
Residuals seems to be scattered randomly when fitted value increases. No signs of heteroskedasticity
Parameters
res.params
Intercept 14.464198 mainroad[T.1] 0.236998 unfurnished -0.159127 stories 0.093023 parking 0.092431 bedrooms 0.059434 bathrooms 0.219987 dtype: float64
plt.figure(figsize=(14,6))
plt.plot(res.predict())
plt.plot(df_encode['price'])
plt.legend(['OLS','Price Distribution'],fontsize=13)
plt.title('Price Fluctuation and OLS prediction')
Text(0.5, 1.0, 'Price Fluctuation and OLS prediction')
import statsmodels.api as sm
fig = plt.figure(figsize=(14, 10))
sm.graphics.plot_ccpr_grid(res,fig=fig)
fig.tight_layout(pad=2.0)
Price ~ Area (OLS)
import statsmodels.api as sm
olsmod = sm.OLS(data['price'], data['area'])
olsres = olsmod.fit()
print(olsres.summary())
OLS Regression Results ======================================================================================= Dep. Variable: price R-squared (uncentered): 0.872 Model: OLS Adj. R-squared (uncentered): 0.872 Method: Least Squares F-statistic: 3717. Date: Thu, 20 Jan 2022 Prob (F-statistic): 2.64e-245 Time: 17:26:02 Log-Likelihood: -8632.0 No. Observations: 545 AIC: 1.727e+04 Df Residuals: 544 BIC: 1.727e+04 Df Model: 1 Covariance Type: nonrobust ============================================================================== coef std err t P>|t| [0.025 0.975] ------------------------------------------------------------------------------ area 855.7099 14.036 60.967 0.000 828.139 883.281 ============================================================================== Omnibus: 45.221 Durbin-Watson: 1.440 Prob(Omnibus): 0.000 Jarque-Bera (JB): 117.437 Skew: -0.411 Prob(JB): 3.15e-26 Kurtosis: 5.120 Cond. No. 1.00 ============================================================================== Notes: [1] R² is computed without centering (uncentered) since the model does not contain a constant. [2] Standard Errors assume that the covariance matrix of the errors is correctly specified.
from statsmodels.graphics.regressionplots import abline_plot
plt.figure(figsize=(14,6))
sns.regplot(x='area',y='price',data=data)
plt.plot(data['area'],olsres.predict(data['area']),color='r')
plt.legend(['sns','OLS'],fontsize=10)
<matplotlib.legend.Legend at 0x258c84835e0>