# https://www.kaggle.com/yasserh/housing-prices-dataset


import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

#Modelling 
import statsmodels.formula.api as smf


# Load the housing-prices dataset (Kaggle link at the top of the file) from a
# local copy of Housing.csv.
housing_csv_path = 'C:/Users/Stevie/Data_Analysis/Regressions/Housing.csv'
data = pd.read_csv(housing_csv_path)


# Count the missing values in each column.
data.isna().sum()

price               0
area                0
bedrooms            0
bathrooms           0
stories             0
mainroad            0
guestroom           0
basement            0
hotwaterheating     0
airconditioning     0
parking             0
prefarea            0
furnishingstatus    0
dtype: int64


# Preview the first rows of the raw data.
data.head()


# Compare the distribution of the raw price with its log10 transform,
# side by side on one figure.
f = plt.figure(figsize=(14, 6))

# `sns.distplot` is deprecated and scheduled for removal. `histplot` with
# kde=True and stat='density' is the closest replacement (density-normalised
# histogram plus a KDE curve); cut=3 lets the KDE extend past the data range
# as distplot did.
# Left panel: price on its original scale.
ax = f.add_subplot(121)
sns.histplot(data['price'], bins=50, kde=True, stat='density',
             kde_kws={'cut': 3}, ax=ax)
ax.set_title("Distribution of housing prices")

# Right panel: log10 of the price.
ax = f.add_subplot(122)
sns.histplot(np.log10(data['price']), bins=50, color='red', kde=True,
             stat='density', kde_kws={'cut': 3}, ax=ax)
ax.set_title("Distribution of $log$ housing prices")

C:\Users\Stevie\anaconda3\lib\site-packages\seaborn\distributions.py:2619: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms).
  warnings.warn(msg, FutureWarning)
C:\Users\Stevie\anaconda3\lib\site-packages\seaborn\distributions.py:2619: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms).
  warnings.warn(msg, FutureWarning)

Text(0.5, 1.0, 'Distribution of $log$ housing prices')


# Grouped histogram of the price for houses with 1, 2 and 3 parking slots
# (houses with 0 slots are not included).
plt.figure(figsize=(14, 8))

prices_by_slots = tuple(
    data.loc[data['parking'] == n_slots, 'price'] for n_slots in (1, 2, 3)
)
plt.hist(prices_by_slots, color=['#3f7ea6', "#023059", '#1b6f1b'])

plt.legend(['1 Parking slot', '2 Parking slots', '3 Parking slots'])
plt.xlabel("Housing Price")
plt.ylabel('Number of obs')
plt.title('Distribution of housing prices given the number of parking slots')

Text(0.5, 1.0, 'Distribution of housing prices given the number of parking slots')


Categorical variables


# One small horizontal bar chart per categorical column, showing how many
# rows fall into each category.
columns = [
    'mainroad',
    'guestroom',
    'basement',
    'airconditioning',
    'hotwaterheating',
    'prefarea',
    'furnishingstatus',
]
for col in columns:
    plt.figure(figsize=(3, 1))
    category_counts = data.groupby(col)[col].count()
    category_counts.plot(kind="barh")
    plt.title(col)


# Preview the data again before the yes/no columns are encoded.
data.head()


# Encode the binary yes/no columns as integers: no -> 0, yes -> 1.
#
# Writing 1/0 into the string columns through `.loc` leaves them with object
# dtype, so they do not behave as numeric variables downstream (the formula
# API, for instance, treats such a column as categorical and reports a term
# like `mainroad[T.1]`). Mapping and casting produces real integer columns.
c = ['mainroad','guestroom','basement','hotwaterheating','airconditioning','prefarea']

# 0 and 1 map to themselves so this cell can be re-run on already-encoded data.
yes_no_codes = {'yes': 1, 'no': 0, 1: 1, 0: 0}

for i in c:
    # A value outside the mapping becomes NaN and makes the cast raise,
    # instead of being silently left in the column.
    data[i] = data[i].map(yes_no_codes).astype('int8')


# One-hot encode `furnishingstatus`. With an empty prefix and separator the
# new columns are named after the category values themselves. drop_first=True
# omits the first category, which then acts as the reference level (both
# remaining dummies equal to 0).
df_encode = pd.get_dummies(
    data,
    columns=['furnishingstatus'],
    prefix='',
    prefix_sep='',
    drop_first=True,
    dtype='int8',
)


# Annotated heatmap of the pairwise correlations in the encoded frame.
corr_matrix = df_encode.corr()

plt.figure(figsize=(14, 8))
sns.heatmap(corr_matrix, annot=True)
plt.title('Correlation Heatmap')

Text(0.5, 1.0, 'Correlation Heatmap')


# Remove the hyphen from the dummy column name.
# NOTE(review): presumably so the column can be referenced as a plain name in
# a model formula, where `-` would be read as an operator — confirm.
df_encode = df_encode.rename(columns={"semi-furnished":'semifurnished'})


# Display the encoded frame.
df_encode


# Distribution of the `area` column.
# `sns.distplot` is deprecated and scheduled for removal. `histplot` with
# kde=True and stat='density' is the closest replacement; cut=3 lets the KDE
# extend past the data range as distplot did.
sns.histplot(data['area'], kde=True, stat='density', kde_kws={'cut': 3})

C:\Users\Stevie\anaconda3\lib\site-packages\seaborn\distributions.py:2619: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms).
  warnings.warn(msg, FutureWarning)

<AxesSubplot:xlabel='area', ylabel='Density'>


import statsmodels.formula.api as smf

# Model the natural log of the price. From here on `df_encode['price']` holds
# log-prices, not raw prices.
# The log is taken from the untouched `data['price']` rather than from
# `df_encode['price']` itself, so re-running this cell does not apply the log
# a second time.
df_encode['price'] = np.log(data['price'])

# Ordinary least squares on furnishing status, stories, parking, bedrooms,
# bathrooms and main-road access.
mod = smf.ols(formula='price~unfurnished+stories+parking+bedrooms+bathrooms+mainroad',data=df_encode)
res = mod.fit()
print(res.summary())

                            OLS Regression Results                            
==============================================================================
Dep. Variable:                  price   R-squared:                       0.499
Model:                            OLS   Adj. R-squared:                  0.493
Method:                 Least Squares   F-statistic:                     89.14
Date:                Thu, 20 Jan 2022   Prob (F-statistic):           2.14e-77
Time:                        15:58:28   Log-Likelihood:                -46.057
No. Observations:                 545   AIC:                             106.1
Df Residuals:                     538   BIC:                             136.2
Df Model:                           6                                         
Covariance Type:            nonrobust                                         
=================================================================================
                    coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------------------------------------
Intercept        14.4642      0.059    243.658      0.000      14.348      14.581
mainroad[T.1]     0.2370      0.034      6.999      0.000       0.170       0.304
unfurnished      -0.1591      0.025     -6.393      0.000      -0.208      -0.110
stories           0.0930      0.015      6.274      0.000       0.064       0.122
parking           0.0924      0.014      6.669      0.000       0.065       0.120
bedrooms          0.0594      0.018      3.347      0.001       0.025       0.094
bathrooms         0.2200      0.025      8.721      0.000       0.170       0.270
==============================================================================
Omnibus:                        2.796   Durbin-Watson:                   0.978
Prob(Omnibus):                  0.247   Jarque-Bera (JB):                2.611
Skew:                          -0.130   Prob(JB):                        0.271
Kurtosis:                       3.218   Cond. No.                         23.2
==============================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.


# Residuals against fitted values for the OLS model: a visual check for
# heteroskedasticity.
fitted_vals = res.fittedvalues
residuals = res.resid

sns.scatterplot(x=fitted_vals, y=residuals)
plt.title('Check for Heteroskedasticity:\n Fitted vs resid')
plt.xlabel('fitted')
plt.ylabel('resid')

Text(0, 0.5, 'resid')


# Display the estimated coefficients of the fitted OLS model.
res.params

Intercept        14.464198
mainroad[T.1]     0.236998
unfurnished      -0.159127
stories           0.093023
parking           0.092431
bedrooms          0.059434
bathrooms         0.219987
dtype: float64


# Plot the in-sample OLS predictions and the observed values against the row
# index. `df_encode['price']` is the log-price at this point, matching the
# scale of the model's predictions.
plt.figure(figsize=(14, 6))
plt.plot(res.predict(), label='OLS')
plt.plot(df_encode['price'], label='Price Distribution')
plt.legend(fontsize=13)
plt.title('Price Fluctuation and OLS prediction')

Text(0.5, 1.0, 'Price Fluctuation and OLS prediction')


import statsmodels.api as sm

# Draw the CCPR (component and component-plus-residual) plots for the fitted
# model `res` as a grid on a single figure.
fig = plt.figure(figsize=(14, 10))
sm.graphics.plot_ccpr_grid(res,fig=fig)
# Extra padding so the subplots' titles and labels do not overlap.
fig.tight_layout(pad=2.0)


import statsmodels.api as sm
# Regress the raw price on area alone, using the array-based OLS interface.
# NOTE(review): `data['area']` is passed as the only regressor and no constant
# column is added, so this fits price = b * area through the origin. The
# summary says as much: R-squared is reported as "uncentered" because the
# model does not contain a constant, which means it is not comparable with the
# R-squared of a model that has an intercept. If an intercept is wanted, the
# regressors need a constant column (e.g. via sm.add_constant), and every later
# `olsres.predict(...)` call must then be given the same two-column input.
olsmod = sm.OLS(data['price'], data['area'])
olsres = olsmod.fit()
print(olsres.summary())

                                 OLS Regression Results                                
=======================================================================================
Dep. Variable:                  price   R-squared (uncentered):                   0.872
Model:                            OLS   Adj. R-squared (uncentered):              0.872
Method:                 Least Squares   F-statistic:                              3717.
Date:                Thu, 20 Jan 2022   Prob (F-statistic):                   2.64e-245
Time:                        17:26:02   Log-Likelihood:                         -8632.0
No. Observations:                 545   AIC:                                  1.727e+04
Df Residuals:                     544   BIC:                                  1.727e+04
Df Model:                           1                                                  
Covariance Type:            nonrobust                                                  
==============================================================================
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
area         855.7099     14.036     60.967      0.000     828.139     883.281
==============================================================================
Omnibus:                       45.221   Durbin-Watson:                   1.440
Prob(Omnibus):                  0.000   Jarque-Bera (JB):              117.437
Skew:                          -0.411   Prob(JB):                     3.15e-26
Kurtosis:                       5.120   Cond. No.                         1.00
==============================================================================

Notes:
[1] R² is computed without centering (uncentered) since the model does not contain a constant.
[2] Standard Errors assume that the covariance matrix of the errors is correctly specified.


from statsmodels.graphics.regressionplots import abline_plot

# Overlay seaborn's own regression fit of price on area with the line
# predicted by the statsmodels model `olsres`.
plt.figure(figsize=(14,6))

# Label the two lines explicitly. Passing only a list of names to legend()
# pairs them with artists by draw order, and regplot also draws a scatter and
# a confidence band, so the names can end up attached to the wrong elements.
sns.regplot(x='area', y='price', data=data, line_kws={'label': 'sns'})
plt.plot(data['area'], olsres.predict(data['area']), color='r', label='OLS')
plt.legend(fontsize=10)

<matplotlib.legend.Legend at 0x258c84835e0>

	price	area	bedrooms	bathrooms	stories	mainroad	guestroom	basement	hotwaterheating	airconditioning	parking	prefarea	semifurnished	unfurnished
0	13300000	7420	4	2	3	1	0	0	0	1	2	1	0	0
1	12250000	8960	4	4	4	1	0	0	0	1	3	0	0	0
2	12250000	9960	3	2	2	1	0	1	0	0	2	1	1	0
3	12215000	7500	4	2	2	1	0	1	0	1	3	1	0	0
4	11410000	7420	4	1	2	1	1	1	0	1	2	0	0	0
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
540	1820000	3000	2	1	1	1	0	1	0	0	2	0	0	1
541	1767150	2400	3	1	1	0	0	0	0	0	0	0	1	0
542	1750000	3620	2	1	1	1	0	0	0	0	0	0	0	1
543	1750000	2910	3	1	1	0	0	0	0	0	0	0	0	0
544	1750000	3850	3	1	2	1	0	0	0	0	0	0	0	1

Data Cleaning

Plot

Data Processing

Data Modelling OLS

Variables used:

Prediction

Alternative for single linear regression and plots

	price	area	bedrooms	bathrooms	stories	mainroad	guestroom	basement	hotwaterheating	airconditioning	parking	prefarea	furnishingstatus
0	13300000	7420	4	2	3	yes	no	no	no	yes	2	yes	furnished
1	12250000	8960	4	4	4	yes	no	no	no	yes	3	no	furnished
2	12250000	9960	3	2	2	yes	no	yes	no	no	2	yes	semi-furnished
3	12215000	7500	4	2	2	yes	no	yes	no	yes	3	yes	furnished
4	11410000	7420	4	1	2	yes	yes	yes	no	yes	2	no	furnished