Real Estate Analysis

-Data Source: ucimlrepo (UCI Real Estate Valuation dataset, id 477)

In this notebook we will analyze the prices of some real estate assets located in New Taipei City, Taiwan.
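Before diving in, it's worth a quick look at the dataset's metadata. A minimal sketch (the attribute names below are as exposed by the ucimlrepo package; adjust if your version differs):

from ucimlrepo import fetch_ucirepo

ds = fetch_ucirepo(id=477)             # Real Estate Valuation data set
print(ds.metadata.name)                # dataset title
print(ds.metadata.num_instances)       # number of listings
print(ds.variables[['name', 'role']])  # column names and their roles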

In [1]:
import math
from datetime import datetime

import folium
import numpy as np
import pandas as pd
from folium.plugins import HeatMap
from ucimlrepo import fetch_ucirepo

real_estate_valuation = fetch_ucirepo(id=477)

df = real_estate_valuation.data.original.copy()


def date_transformation(date):
    # The dataset stores the transaction date as a fractional year:
    # 2013.250 means 2013 March, 2013.500 means 2013 June, and so on.
    value = float(date)
    year = int(value)
    month = math.floor((value - year) * 12)
    if month == 0:
        month = 1
    return datetime(year=year, month=month, day=1)
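# Sanity check of the conversion (fractional-year encoding assumed):
# date_transformation('2013.250') -> datetime(2013, 3, 1), since floor(0.25 * 12) = 3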


df['X1 transaction date'] = df['X1 transaction date'].apply(date_transformation)
df.drop(columns=['No'], inplace=True)
df['X1 transaction date'] = pd.to_datetime(df['X1 transaction date'])
df.sort_values(by='X1 transaction date', inplace=True)
df.reset_index(drop=True, inplace=True)
prices = df['Y house price of unit area'].to_list()
# Marker colors supported by folium.Icon, kept here for reference
color_options = {
    "red",
    "darkred",
    "lightred",
    "orange",
    "beige",
    "green",
    "darkgreen",
    "lightgreen",
    "blue",
    "darkblue",
    "cadetblue",
    "lightblue",
    "purple",
    "darkpurple",
    "pink",
    "white",
    "gray",
    "lightgray",
    "black",
}

min_price = min(prices)
max_price = max(prices)
intermediate_price = np.median(prices)
num_categorias = 5
df.sort_values(by='Y house price of unit area', ascending=True, inplace=True)
df.reset_index(drop=True, inplace=True)

# Categorize the prices into five quantile bins of (roughly) equal size.
# Labels run from cheapest ('Super Barata') to most expensive ('Super Cara').
etiquetas = ['Super Barata', 'Barata', 'Media', 'Cara', 'Super Cara']
colores = ['lightgreen', 'green', 'darkgreen', 'red', 'darkred']

df = df.assign(categorias=pd.qcut(df['Y house price of unit area'], q=num_categorias, labels=etiquetas))

df = df.assign(colores_zone=pd.qcut(df['Y house price of unit area'], q=num_categorias, labels=colores))

# Rank-based weights for the heat map: the cheapest house gets 1, the dearest gets len(df)
df['weights'] = df.index.values + 1

lat = df['X5 latitude'].to_list()
lon = df['X6 longitude'].to_list()

max_lat = max(lat) + 0.007
min_lat = min(lat) - 0.007
max_lon = max(lon) + 0.007
min_lon = min(lon) - 0.007

m = folium.Map(location=[24.96515, 121.53737], zoom_start=13,
               control_scale=True,
               scrollWheelZoom=False)

feature_gp1 = folium.FeatureGroup(name="icons")
feature_gp2 = folium.FeatureGroup(name="heat zone")

for i in df.index:
    latitud = df.at[i, 'X5 latitude']
    longitud = df.at[i, 'X6 longitude']
    age = df.at[i, 'X2 house age']
    color = df.at[i, 'colores_zone']
    # The marker color encodes the house's price category
    iconos = folium.Icon(color=color, prefix='fa', icon='fa-house')

    marker = folium.Marker(
        location=[latitud, longitud],
        popup=f'{latitud}, {longitud}',
        icon=iconos,
        tooltip=f'Age: {age} years')
    marker.add_to(feature_gp1)

ls = folium.PolyLine(
    locations=[[max_lat, max_lon],
               [max_lat, min_lon],
               [min_lat, min_lon],
               [min_lat, max_lon],
               [max_lat, max_lon]], color="purple"
)

# [lat, lon, weight] triples for the heat layer, weighted by price rank
my_list = df[['X5 latitude', 'X6 longitude', 'weights']].values.tolist()
feature_gp2.add_child(HeatMap(data=my_list, radius=26))

ls.add_child(folium.Popup("outline Popup on Polyline"))
m.add_child(ls)
feature_gp2.add_to(m)
feature_gp1.add_to(m)
folium.LayerControl().add_to(m)
folium.LatLngPopup().add_to(m)

m
Out[1]:
[interactive folium map: price-colored markers, heat layer and bounding box]

Here we've added our data to a folium map, using the marker colors available in folium to represent each house's price in an intuitive way. A heat map layer has also been added.
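As a minimal, standalone sketch of the same pattern (hypothetical coordinates and weights), this is all folium needs for a marker layer plus a heat layer behind a layer control:

import folium
from folium.plugins import HeatMap

m_demo = folium.Map(location=[24.96, 121.54], zoom_start=13)
markers = folium.FeatureGroup(name='icons')
heat = folium.FeatureGroup(name='heat zone')

# One color-coded marker and a tiny weighted heat layer (toy data)
folium.Marker([24.96, 121.54],
              icon=folium.Icon(color='green', prefix='fa', icon='fa-house'),
              tooltip='demo house').add_to(markers)
HeatMap([[24.96, 121.54, 1], [24.965, 121.545, 3]], radius=26).add_to(heat)

markers.add_to(m_demo)
heat.add_to(m_demo)
folium.LayerControl().add_to(m_demo)
m_demo  # renders inline in a notebook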

In [1]:
import osmnx as ox
from geopy.geocoders import Photon

geolocator = Photon(user_agent="measurements")

# Pick one listing (the house aged 13.9 years) and reverse-geocode its coordinates
latitud = str(df.loc[df['X2 house age'] == 13.9, 'X5 latitude'].iloc[0])
longitud = str(df.loc[df['X2 house age'] == 13.9, 'X6 longitude'].iloc[0])

location = geolocator.reverse(latitud + "," + longitud)
address = location.raw['properties']['city']
print(address)

# ox.config() is deprecated; the settings module replaces it
ox.settings.log_console = True
ox.settings.use_cache = True

# define the place query
query = {'city': address}

# get the boundaries of the place
gdf = ox.geocode_to_gdf(query)
gdf.plot()
新北市
Out[1]:
<Axes: >
[plot: boundary polygon of the geocoded city]

Here, even though we already know the location of our data from its source, this code is useful when handling unknown coordinates: it tells us the city (新北市, i.e. New Taipei City) and generates a GeoDataFrame of its boundary to work with.
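A minimal sketch of the same lookup for arbitrary coordinates (the GeoJSON layout of Photon's response is an assumption; the 'city' property is not guaranteed to be present, hence the .get() fallback):

from geopy.geocoders import Photon

def city_of(lat, lon, geocoder=None):
    # Reverse-geocode (lat, lon) and return the city name, if Photon provides one
    geocoder = geocoder or Photon(user_agent="measurements")
    loc = geocoder.reverse(f"{lat},{lon}")
    return loc.raw.get('properties', {}).get('city') if loc else None

print(city_of(24.96515, 121.53737))  # expected: 新北市 for this dataset's area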

In [1]:
df.columns = ['transaction_date', 'age', 'distance_to_the_nearest_MRT_station', 'number_of_convenience_stores', 'lat',
              'lon', 'price', 'category', 'color', 'weights']
import plotly.express as px

# Mean of every numeric feature per price category
muy_baratas = df.groupby(by='category', observed=True).agg(
    {'number_of_convenience_stores': 'mean', 'distance_to_the_nearest_MRT_station': 'mean', 'price': 'mean',
     'age': 'mean'})
muy_baratas['category'] = muy_baratas.index.values
print(muy_baratas.to_string())

# Bubble chart: one bubble per category, sized by the mean number of convenience stores
fig = px.scatter(muy_baratas, x="age", y="price",
                 size="number_of_convenience_stores",
                 hover_name="category", log_x=True, size_max=100, color='category')
fig.show('png')
              number_of_convenience_stores  distance_to_the_nearest_MRT_station      price        age      category
category                                                                                                           
Super Barata                      1.240964                          2804.101812  19.802410  19.500000  Super Barata
Barata                            2.879518                          1354.403169  29.953012  17.818072        Barata
Media                             4.771084                           542.854812  38.563855  23.844578         Media
Cara                              5.548780                           450.979636  44.628049  16.601220          Cara
Super Cara                        6.048193                           259.463642  57.033735  10.785542    Super Cara
[bubble chart: mean price vs. mean age per category; bubble size = mean number of convenience stores]

Here we've grouped the data by category and calculated the mean value of each parameter. We can see that the most expensive houses are the ones located in the city centre, and the cheapest are the ones in the periphery.

-Expensive houses are newer than cheaper houses.

-Even though a house may be recently built, the price depends hugely on its location.

-This also tells us that a house with a convenience store nearby is more likely to be expensive (see the quick check below).
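As a quick check of that last point (reusing the renamed df from above), we can look at the mean unit price for each store count directly:

# Mean unit price for each number of nearby convenience stores
print(df.groupby('number_of_convenience_stores')['price'].mean().round(1))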

Let's make a more in-depth analysis and check the houses downtown, where most of the data is concentrated.

In [1]:
import numpy as np
import scipy.spatial.distance as distance
import geopy.distance

# List of (latitude, longitude) pairs, one per listing
points = list(zip(df['lat'], df['lon']))


def get_distance(point1, point2):
    # Great-circle (haversine) distance between two (lat, lon) points, in km
    R = 6371  # Earth's radius in kilometres
    lat1, lon1 = np.radians(point1)
    lat2, lon2 = np.radians(point2)
    dlon = lon2 - lon1
    dlat = lat2 - lat1
    a = np.sin(dlat / 2) ** 2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon / 2) ** 2
    c = 2 * np.arctan2(np.sqrt(a), np.sqrt(1 - a))
    return R * c


all_points = np.array(points)

# The "central point" (medoid): the listing whose total distance to all others is smallest
dist_matrix = distance.cdist(all_points, all_points, get_distance)
center_index = np.argmin(dist_matrix.sum(axis=1))
center_point = points[center_index]

radius = 1000  # metres, for the folium.Circle drawn below

# Keep only the listings within 1 km of the central point
selected_points = df.copy()
for i in selected_points.index:
    latitud = selected_points.at[i, 'lat']
    longitud = selected_points.at[i, 'lon']
    if geopy.distance.geodesic((latitud, longitud), center_point).kilometers > 1:
        selected_points.drop(index=i, inplace=True)


m = folium.Map(location=[24.96515, 121.53737], zoom_start=13,
               control_scale=True,
               scrollWheelZoom=False)
for i in selected_points.index:
    latitud = selected_points.at[i, 'lat']
    longitud = selected_points.at[i, 'lon']
    peso = selected_points.at[i, 'weights']
    age = selected_points.at[i, 'age']
    color = selected_points.at[i, 'color']
    # The marker color again encodes the price category
    iconos = folium.Icon(color=color, prefix='fa', icon='fa-house')

    marker = folium.Marker(
        location=[latitud, longitud],
        popup=peso,
        icon=iconos,
        tooltip=f'Age: {age} years')
    marker.add_to(m)

# Draw the selection circle around the central point (radius is in metres)
folium.Circle(
    location=[center_point[0], center_point[1]],
    radius=radius,
    fill=False,
    popup="{} m".format(radius),
).add_to(m)

m
Out[1]:
[interactive folium map: downtown listings inside the 1 km circle]

Here we've calculated the central point of our coordinates (i.e. the point whose total distance to every other point is minimal) and reduced our data to the circle around it, to capture the area with the highest density of houses.
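To make the "central point" idea concrete, here is the same medoid computation on three toy coordinates (hypothetical points, reusing the get_distance() helper from the cell above):

import numpy as np

pts = np.array([(24.96, 121.53), (24.97, 121.54), (24.99, 121.56)])
# Pairwise great-circle distances; the medoid has the smallest row sum
d = np.array([[get_distance(p, q) for q in pts] for p in pts])
print(pts[d.sum(axis=1).argmin()])  # the middle point wins here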

In [1]:
selected_points_agg = selected_points.groupby(by='category', observed=True).agg(
    {'number_of_convenience_stores': 'mean', 'distance_to_the_nearest_MRT_station': 'mean', 'price': 'mean',
     'age': 'mean'})
print(selected_points_agg.to_string())
            number_of_convenience_stores  distance_to_the_nearest_MRT_station      price        age
category                                                                                           
Barata                          3.950000                           446.603880  31.370000  26.170000
Media                           5.070175                           443.935544  38.726316  25.689474
Cara                            6.070175                           317.435891  44.433333  17.573684
Super Cara                      5.983051                           252.142486  58.230508  10.827119

We can see that if we restrict the data to a 1 km radius, with the same aggregation as before, the predominant parameters defining the price downtown are no longer the number of nearby convenience stores, but the distance to the nearest metro station and the age of the house.
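A quick way to back that up numerically (a hedged check, reusing selected_points):

# Correlation of price with the candidate drivers, inside the 1 km circle
cols = ['price', 'distance_to_the_nearest_MRT_station', 'number_of_convenience_stores', 'age']
print(selected_points[cols].corr()['price'].round(2))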

Let's look at a more specific insight.

In [1]:
casas_nuevas = selected_points.copy()
casas_nuevas.drop(columns=['transaction_date', 'category', 'color', 'weights'], inplace=True)
print(casas_nuevas.corr().to_string())
                                          age  distance_to_the_nearest_MRT_station  number_of_convenience_stores       lat       lon     price
age                                  1.000000                             0.273330                      0.074861  0.138400 -0.109839 -0.326439
distance_to_the_nearest_MRT_station  0.273330                             1.000000                     -0.260047  0.225971 -0.108378 -0.343903
number_of_convenience_stores         0.074861                            -0.260047                      1.000000  0.056987  0.690952  0.167605
lat                                  0.138400                             0.225971                      0.056987  1.000000 -0.078110  0.084663
lon                                 -0.109839                            -0.108378                      0.690952 -0.078110  1.000000  0.204144
price                               -0.326439                            -0.343903                      0.167605  0.084663  0.204144  1.000000

This is a super useful tool, one of the cores of statistics. The correlation coefficient tells us how strongly bound two variables are, and in which manner: a correlation of 1 means an increasing relationship (if one variable increases, so does the other); -1 means the opposite (one increases as the other decreases); and 0 means no linear relationship between them can be established.
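A tiny worked example of those three cases with numpy:

import numpy as np

x = np.arange(1, 6)
print(np.corrcoef(x, 2 * x)[0, 1])            #  1.0: they increase together
print(np.corrcoef(x, -2 * x)[0, 1])           # -1.0: one rises as the other falls
print(np.corrcoef(x, [2, 5, 1, 4, 3])[0, 1])  #  0.1: barely any linear relation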

We will see these relations in a more visual way, using a matrix where both axes list every variable and each intersection shows a scatter plot of the relationship between the corresponding pair.

In [1]:
import seaborn as sns
import matplotlib.pyplot as plt

# Pairwise scatter plots for every variable pair; KDE curves on the diagonal
sns.pairplot(selected_points, diag_kind='kde')
plt.show()
[pairplot: pairwise scatter plots of selected_points with KDE curves on the diagonal]

The cells of the matrix where a variable intersects with itself (the diagonal) show that variable's distribution, i.e. the ranges of values where most of its data is concentrated.

Also, where the points are more scattered, the relationship between those variables is weaker; where the points tend to rise or fall along the Y axis as we move along the X axis, the correlation tends towards 1 or -1.

In [1]:
import seaborn as sns
import matplotlib.pyplot as plt

sns.histplot(selected_points['age'], bins=20)
plt.xlabel('House age')
plt.ylabel('Frequency')
plt.title('Distribution of house ages')
plt.show()
[histogram: distribution of house ages, 20 bins]

Here we get a basic view of the distribution of house age, the same distribution that describes how the age variable spreads along the X axis in every age scatter plot above.

In [1]:
sns.scatterplot(x='age', y='price', data=selected_points)
plt.xlabel('Age')
plt.ylabel('House price')
plt.title('Relationship between age and price')
plt.show()
[scatter plot: age vs. price]

Finally, let's see whether we can find the cases that weakened our correlation, by clustering the (age, price) pairs and flagging the points that don't fit the bulk of the data. (The correlation between age and price, -0.32, is an important factor: it means that as age grows, the price is likely to go down.)

In [1]:
from sklearn.cluster import DBSCAN

# Cluster the (age, price) pairs; points DBSCAN cannot assign become noise (label -1)
X = df[['age', 'price']].to_numpy()


db = DBSCAN(eps=9, min_samples=10).fit(X)
labels = db.labels_
# Number of clusters in labels, ignoring noise if present.
n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)
n_noise_ = list(labels).count(-1)

# print("Number of clusters: %d" % n_clusters_)
# print("Number of noise points: %d" % n_noise_)
unique_labels = set(labels)
core_samples_mask = np.zeros_like(labels, dtype=bool)
core_samples_mask[db.core_sample_indices_] = True
colors = [plt.cm.Spectral(each) for each in np.linspace(0, 1, len(unique_labels))]

# Indices of every point outside the main cluster (label 0), i.e. the noise
# points (label -1) that we treat as outlier candidates
indices_falsos = [i for i, label in enumerate(labels) if label != 0]

for k, col in zip(unique_labels, colors):
    if k == -1:
        # Black used for noise.
        col = [0, 0, 0, 1]

    class_member_mask = labels == k
    # Core points of this cluster: plotted as large markers
    xy_ = X[class_member_mask & core_samples_mask]
    plt.plot(
        xy_[:, 0],
        xy_[:, 1],
        "o",
        markerfacecolor=tuple(col),
        markeredgecolor="k",
        markersize=14,
    )
    # Non-core points (border points and noise): plotted as small markers
    xy_outliers = X[class_member_mask & ~core_samples_mask]
    print(xy_outliers)
    plt.plot(
        xy_outliers[:, 0],
        xy_outliers[:, 1],
        "o",
        markerfacecolor=tuple(col),
        markeredgecolor="k",
        markersize=6,
    )
    

plt.title(f"Number of clusters: {n_clusters_}")
plt.show() 
[[35.9 61.5]
 [ 0.  69.7]
 [ 0.  70.1]
 [ 0.  71. ]]
[[ 41.3  60.7]
 [ 38.6  62.9]
 [ 41.4  63.3]
 [ 40.9  67.7]
 [  0.   73.6]
 [ 35.4  78. ]
 [ 37.2  78.3]
 [ 10.8 117.5]]
[DBSCAN clusters over (age, price): large dots = core points, small dots = border/noise points, black = noise]

With a little tuning, we have spotted some possible outliers.
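A small sketch for probing how sensitive that result is to DBSCAN's parameters (reusing X from above; the eps values are arbitrary):

# How the number of noise points changes with eps (min_samples held fixed)
for eps in (5, 9, 15):
    lbls = DBSCAN(eps=eps, min_samples=10).fit(X).labels_
    print(f"eps={eps}: {list(lbls).count(-1)} noise points")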

In [1]:
outliers = df.loc[indices_falsos]
outliers
Out[1]:
     transaction_date   age  distance_to_the_nearest_MRT_station  number_of_convenience_stores       lat        lon  price    category    color  weights
395        2013-01-01  41.3                             124.9912                             6  24.96674  121.54039   60.7  Super Cara  darkred      396
401        2013-01-01  38.6                             804.6897                             4  24.97838  121.53477   62.9  Super Cara  darkred      402
404        2013-01-01  41.4                             281.2050                             8  24.97345  121.54093   63.3  Super Cara  darkred      405
406        2013-01-01  40.9                             122.3619                             8  24.96756  121.54230   67.7  Super Cara  darkred      407
410        2013-05-01   0.0                             292.9978                             6  24.97744  121.54458   73.6  Super Cara  darkred      411
411        2013-06-01  35.4                             318.5292                             9  24.97071  121.54069   78.0  Super Cara  darkred      412
412        2013-03-01  37.2                             186.5101                             9  24.97703  121.54265   78.3  Super Cara  darkred      413
413        2013-03-01  10.8                             252.5822                             1  24.97460  121.53046  117.5  Super Cara  darkred      414

Here we have located the elements of our dataframe that might have skewed our correlation value. Let's drop these items to see if we get a more balanced correlation, one that better measures the general nature of the data, leaving aside special cases.

In [1]:
df.drop(index=indices_falsos, inplace=True)
casas_nuevas.drop(index=indices_falsos, inplace=True)
In [1]:
new_cov = df.copy()
new_cov.drop(columns=['transaction_date', 'category', 'color', 'weights'], inplace=True)
print(new_cov.corr().to_string())
                                          age  distance_to_the_nearest_MRT_station  number_of_convenience_stores       lat       lon     price
age                                  1.000000                             0.040930                      0.021610  0.052362 -0.060542 -0.288220
distance_to_the_nearest_MRT_station  0.040930                             1.000000                     -0.602641 -0.591282 -0.806179 -0.708534
number_of_convenience_stores         0.021610                            -0.602641                      1.000000  0.447611  0.445918  0.606093
lat                                  0.052362                            -0.591282                      0.447611  1.000000  0.412476  0.582044
lon                                 -0.060542                            -0.806179                      0.445918  0.412476  1.000000  0.559409
price                               -0.288220                            -0.708534                      0.606093  0.582044  0.559409  1.000000
In [1]:
print(casas_nuevas.corr().to_string())
                                          age  distance_to_the_nearest_MRT_station  number_of_convenience_stores       lat       lon     price
age                                  1.000000                             0.303168                      0.042480  0.167210 -0.119470 -0.526540
distance_to_the_nearest_MRT_station  0.303168                             1.000000                     -0.253274  0.210262 -0.095022 -0.404114
number_of_convenience_stores         0.042480                            -0.253274                      1.000000  0.070082  0.687840  0.223867
lat                                  0.167210                             0.210262                      0.070082  1.000000 -0.074338  0.102738
lon                                 -0.119470                            -0.095022                      0.687840 -0.074338  1.000000  0.336239
price                               -0.526540                            -0.404114                      0.223867  0.102738  0.336239  1.000000
In [1]:
import plotly.graph_objects as go


muy_baratas.reset_index(drop=True, inplace=True)
muy_baratas['loc'] = muy_baratas.index.values + 1
muy_baratas.sort_values(by='price', inplace=True, ascending=True)

print(muy_baratas)

# One parallel-coordinates axis per aggregated feature; the first axis maps
# the numeric 'loc' values back onto the category names
dimensions = [
    dict(range=(muy_baratas['loc'].max(), 1),
         tickvals=muy_baratas['loc'], ticktext=muy_baratas['category'],
         label='Category', values=muy_baratas['loc']),
    dict(range=(0, muy_baratas['price'].max()), label='Price', values=muy_baratas['price']),
    dict(range=(0, muy_baratas['number_of_convenience_stores'].max()), label='Convenience stores',
         values=muy_baratas['number_of_convenience_stores']),
    dict(range=(0, muy_baratas['distance_to_the_nearest_MRT_station'].max()), label='Distance to nearest station',
         values=muy_baratas['distance_to_the_nearest_MRT_station']),
    dict(range=(0, muy_baratas['age'].max()), label='Age', values=muy_baratas['age']),
]
fig = go.Figure(data=go.Parcoords(line=dict(color=0, colorscale='agsunset'), dimensions=dimensions))
fig.update_layout(width=800, height=400, margin=dict(l=150, r=60, t=60, b=40))
fig.show("png")
   number_of_convenience_stores  distance_to_the_nearest_MRT_station  \
0                      1.240964                          2804.101812   
1                      2.879518                          1354.403169   
2                      4.771084                           542.854812   
3                      5.548780                           450.979636   
4                      6.048193                           259.463642   

       price        age      category  loc  
0  19.802410  19.500000  Super Barata    1  
1  29.953012  17.818072        Barata    2  
2  38.563855  23.844578         Media    3  
3  44.628049  16.601220          Cara    4  
4  57.033735  10.785542    Super Cara    5  
[parallel-coordinates plot: category, price, convenience stores, MRT distance and age per category]

Here we recalculated the correlations on the whole dataset and on the 1 km selection, dropping the outliers to get more precise information. The correlation between price and age downtown has gone from -0.32 to -0.52!

This notebook has exemplified some common tools in data analysis, and some core concepts of the discipline of statistics.