import folium
import random
import pandas as pd
import numpy as np
from folium.plugins import HeatMap
from ucimlrepo import fetch_ucirepo
from datetime import datetime
import math
real_estate_valuation = fetch_ucirepo(id=477)
df = real_estate_valuation.data.original.copy()
def date_transformation(date):
    # The digits after the decimal point encode the month as thousandths of a
    # year, so scale them back to the 1-12 range
    year = int(date.split('.')[0])
    # Pad the fractional part so '2013.5' is read as .500 rather than .005
    fraction = int(date.split('.')[1].ljust(3, '0'))
    converted_month = math.floor(12 / 1000 * fraction)
    if converted_month == 0:
        converted_month = 1
    return datetime(year=year, month=converted_month, day=1)
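# For illustration: 12/1000 * 250 = 3, so date_transformation('2013.250')
# returns datetime(2013, 3, 1)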
df['X1 transaction date'] = df['X1 transaction date'].apply(lambda date: date_transformation(str(date)))
df.drop(columns=['No'], inplace=True)
df['X1 transaction date'] = pd.to_datetime(df['X1 transaction date'])
df.sort_values(by='X1 transaction date', axis=0, inplace=True)
df.reset_index(drop=True, inplace=True)
prices = df['Y house price of unit area'].to_list()
# Marker colors supported by folium.Icon (kept for reference)
color_options = {
    "red", "darkred", "lightred", "orange", "beige",
    "green", "darkgreen", "lightgreen", "blue", "darkblue",
    "cadetblue", "lightblue", "purple", "darkpurple",
    "pink", "white", "gray", "lightgray", "black",
}
min_price = min(prices)
max_price = max(prices)
intermediate_price = np.median(prices)
num_categorias = 5
df.sort_values(by='Y house price of unit area', ascending=True, inplace=True)
df.reset_index(inplace=True, drop=True)
# Bucket the prices into five quantile-based categories of roughly equal size
etiquetas = ['Super Barata', 'Barata', 'Media', 'Cara', 'Super Cara']
colores = ['lightgreen', 'green', 'darkgreen', 'red', 'darkred']
pesos = df.index.values
# Alternative with fixed cut points instead of quantiles:
# intervalos = [min_price, intermediate_price, max_price, np.inf]
# categorias = pd.cut(prices, bins=intervalos, labels=etiquetas)
df = df.assign(categorias=pd.qcut(df['Y house price of unit area'], q=num_categorias, labels=etiquetas))
df = df.assign(colores_zone=pd.qcut(df['Y house price of unit area'], q=num_categorias, labels=colores))
# Heat-map weights: the rank of each row after sorting by price (1 = cheapest)
df['weights'] = (pesos + 1)
lat = df['X5 latitude'].to_list()
lon = df['X6 longitude'].to_list()
max_lat = max(lat) + 0.007
min_lat = min(lat) - 0.007
max_lon = max(lon) + 0.007
min_lon = min(lon) - 0.007
m = folium.Map(location=[24.96515, 121.53737], zoom_start=13,
               control_scale=True,
               scrollWheelZoom=False)
feature_gp1 = folium.FeatureGroup(name="icons")
feature_gp2 = folium.FeatureGroup(name="heat zone")
for i in df.index:
    latitud = df.loc[i, 'X5 latitude']
    longitud = df.loc[i, 'X6 longitude']
    peso = df.loc[i, 'weights']
    price = df.loc[i, 'Y house price of unit area']
    age = df.loc[i, 'X2 house age']
    color = df.loc[i, 'colores_zone']
    # Random hex color, usable via the optional icon_color argument below
    icon_color = "#" + ''.join(random.choice('ABCDEF0123456789') for _ in range(6))
    iconos = folium.Icon(
        color=color, prefix='fa', icon='fa-house',
        # icon_color=icon_color
    )
    marker = folium.Marker(
        location=[latitud, longitud],
        popup=(latitud, longitud),  # pop-up label showing the coordinates
        icon=iconos,
        tooltip=f'Antiquity: {age} years')
    marker.add_to(feature_gp1)
# Purple outline around the bounding box of all the points
ls = folium.PolyLine(
    locations=[[max_lat, max_lon],
               [max_lat, min_lon],
               [min_lat, min_lon],
               [min_lat, max_lon],
               [max_lat, max_lon]], color="purple"
)
# Build [lat, lon, weight] triples for the heat map
my_list = [[df.loc[i, 'X5 latitude'], df.loc[i, 'X6 longitude'], df.loc[i, 'weights']]
           for i in df.index]
feature_gp2.add_child(HeatMap(data=my_list, radius=26))
ls.add_child(folium.Popup("outline Popup on Polyline"))
m.add_child(ls)
feature_gp2.add_to(m)
feature_gp1.add_to(m)
folium.LayerControl().add_to(m)
folium.LatLngPopup().add_to(m)
m
Here we've added our data to a folium map, using the marker colors available in folium to represent each house's price in an intuitive way. A heat map layer has also been added.
import osmnx as ox
from geopy.geocoders import Photon
geolocator = Photon(user_agent="measurements")
latitud = str(df.loc[df['X2 house age'] == 13.9]['X5 latitude'].to_list()[0])
longitud = str(df.loc[df['X2 house age'] == 13.9]['X6 longitude'].to_list()[0])
location = geolocator.reverse(latitud + "," + longitud)
address = location.raw['properties']['city']
print(address)
ox.settings.log_console = True
ox.settings.use_cache = True
# define the place query
query = {'city': address}
# get the boundaries of the place
gdf = ox.geocode_to_gdf(query)
gdf.plot()
新北市
(plot of the city boundary)
Here, even though we already know the location of our data from its source, this code is useful when handling unknown coordinates: it tells us the city and generates a GeoDataFrame of its boundary to work with.
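If the response lacked a 'city' property, the lookup above would raise a KeyError. A minimal defensive sketch (the helper name and the fallback keys are our own assumptions about Photon's GeoJSON properties):
def city_from_coords(lat, lon, geolocator):
    loc = geolocator.reverse(f"{lat},{lon}")
    props = loc.raw.get('properties', {}) if loc is not None else {}
    # Fall back to coarser admin levels when 'city' is missing
    return props.get('city') or props.get('county') or props.get('state')
print(city_from_coords(latitud, longitud, geolocator))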
df.columns = ['transaction_date', 'age', 'distance_to_the_nearest_MRT_station', 'number_of_convenience_stores', 'lat',
              'lon', 'price', 'category', 'color', 'weights']
muy_baratas = df.groupby(by='category', observed=True).agg(
    {'number_of_convenience_stores': 'mean', 'distance_to_the_nearest_MRT_station': 'mean', 'price': 'mean',
     'age': 'mean'})
muy_baratas['category'] = muy_baratas.index.values
print(muy_baratas.to_string())
import plotly.express as px
fig = px.scatter(muy_baratas, x="age", y="price",
                 size="number_of_convenience_stores",
                 hover_name="category", log_x=True, size_max=100, color='category')
fig.show('png')
| category | number_of_convenience_stores | distance_to_the_nearest_MRT_station | price | age |
|---|---|---|---|---|
| Super Barata | 1.240964 | 2804.101812 | 19.802410 | 19.500000 |
| Barata | 2.879518 | 1354.403169 | 29.953012 | 17.818072 |
| Media | 4.771084 | 542.854812 | 38.563855 | 23.844578 |
| Cara | 5.548780 | 450.979636 | 44.628049 | 16.601220 |
| Super Cara | 6.048193 | 259.463642 | 57.033735 | 10.785542 |
Here we've grouped the data by category and calculated the mean of each parameter. We can see that the most expensive houses are the ones located in the city, and the cheapest are the ones in the periphery.
-Expensive houses are newer than cheaper houses
-Even though a house may be recently built, its price depends hugely on its location
-It also tells us that a house with a convenience store nearby is more likely to be expensive (a quick check of this claim is sketched below)
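A one-line sanity check of the convenience-store claim (our own addition, using the renamed columns): the correlation between store count and price should come out positive.
print(df['number_of_convenience_stores'].corr(df['price']))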
Let's make a more in-depth analysis and check the houses in the downtown, where most of the data is located.
import numpy as np
import math
import scipy.spatial.distance as distance
import geopy.distance
# Suppose we have a list of (latitude, longitude) coordinates:
# points = [(lat1, lon1), (lat2, lon2), ...]
points = list(zip(df['lat'], df['lon']))
# 1. Compute the pairwise distance matrix
def get_distance(point1, point2):
    # Haversine great-circle distance between two (lat, lon) points, in km
    R = 6371  # Earth's radius in kilometers
    lat1, lon1 = np.radians(point1)
    lat2, lon2 = np.radians(point2)
    dlon = lon2 - lon1
    dlat = lat2 - lat1
    a = np.sin(dlat / 2) ** 2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon / 2) ** 2
    c = 2 * np.arctan2(np.sqrt(a), np.sqrt(1 - a))
    return R * c
all_points = np.array(points)
dist_matrix = distance.cdist(all_points, all_points, get_distance)
# The medoid: the observation whose summed distance to all the others is minimal
center_index = np.argmin(dist_matrix.sum(axis=1))
center_point = points[center_index]
radius = 1000  # meters, for the folium Circle below
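# Optional sanity check (ours): the spherical haversine result should sit
# within roughly half a percent of geopy's ellipsoidal geodesic distance.
# p, q = points[0], points[1]
# print(get_distance(p, q), geopy.distance.geodesic(p, q).km)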
def calcular_nuevas_coordenadas(latitud, longitud, distancia_vertical, distancia_horizontal):
    # Offset a coordinate by a distance in km along each axis:
    # ~111.32 km per degree of latitude, and ~100 km per degree of longitude
    # at this latitude (111.32 * cos(25 deg) is roughly 100)
    nueva_latitud = latitud + (distancia_vertical / 111.32)
    nueva_longitud = longitud + (distancia_horizontal / 100)
    return nueva_latitud, nueva_longitud
centro_circunferencia = (center_point[0], center_point[1])
# 1 km offsets from the center, kept for reference; the filtering below uses
# exact geodesic distances instead
lat_axis, lon_axis = calcular_nuevas_coordenadas(center_point[0], center_point[1],
                                                 distancia_vertical=1, distancia_horizontal=1)
lat_axis_ngtv, lon_axis_ngtv = calcular_nuevas_coordenadas(center_point[0], center_point[1],
                                                           distancia_vertical=-1, distancia_horizontal=-1)
x_limit = lat_axis - center_point[0]
y_limit = lon_axis - center_point[1]
selected_points = df.copy()
# Keep only the houses within 1 km (geodesic) of the central point
for i in selected_points.index.values:
    latitud = selected_points.loc[i, 'lat']
    longitud = selected_points.loc[i, 'lon']
    if geopy.distance.geodesic((latitud, longitud), center_point).kilometers > radius / 1000:
        selected_points.drop(axis=0, index=i, inplace=True)
m = folium.Map(location=[24.96515, 121.53737], zoom_start=13,
               control_scale=True,
               scrollWheelZoom=False)
for i in selected_points.index:
    latitud = selected_points.loc[i, 'lat']
    longitud = selected_points.loc[i, 'lon']
    peso = selected_points.loc[i, 'weights']
    age = selected_points.loc[i, 'age']
    color = selected_points.loc[i, 'color']
    iconos = folium.Icon(color=color, prefix='fa', icon='fa-house')
    marker = folium.Marker(
        location=[latitud, longitud],
        popup=peso,  # the price rank assigned earlier (1 = cheapest)
        icon=iconos,
        tooltip=f'Antiquity: {age} years')
    marker.add_to(m)
folium.Circle(
    location=[center_point[0], center_point[1]],
    radius=radius,  # meters
    fill_color="cornflowerblue",
    fill=False,
    popup="{} m".format(radius),
).add_to(m)
m
Here we've calculated the central point of our coordinates (i.e., the point whose summed distance to every other point is minimal) and restricted our data to a circle around it, to focus on the area with the highest density of houses.
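To make the idea concrete, here is a tiny sketch (toy coordinates of our own) showing that this central point, the medoid, is always one of the observations, unlike the coordinate-wise mean:
toy = np.array([(0.0, 0.0), (0.0, 2.0), (10.0, 0.0)])
centroid = toy.mean(axis=0)  # not a data point in general
d = distance.cdist(toy, toy, get_distance)
medoid = toy[np.argmin(d.sum(axis=1))]  # always an actual observation
print(centroid, medoid)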
selected_points_agg = selected_points.groupby(by='category', observed=True).agg(
    {'number_of_convenience_stores': 'mean', 'distance_to_the_nearest_MRT_station': 'mean', 'price': 'mean',
     'age': 'mean'})
print(selected_points_agg.to_string())
| category | number_of_convenience_stores | distance_to_the_nearest_MRT_station | price | age |
|---|---|---|---|---|
| Barata | 3.950000 | 446.603880 | 31.370000 | 26.170000 |
| Media | 5.070175 | 443.935544 | 38.726316 | 25.689474 |
| Cara | 6.070175 | 317.435891 | 44.433333 | 17.573684 |
| Super Cara | 5.983051 | 252.142486 | 58.230508 | 10.827119 |
We can see that if we reduce our data to a 1 km radius, with the same aggregation as before, the predominant parameters defining the price downtown are not the number of convenience stores nearby but the distance to the nearest metro station and the age of the house.
Let's look at a more specific insight.
casas_nuevas = selected_points.copy()
casas_nuevas.drop(columns=['transaction_date', 'category', 'color', 'weights'], axis=1, inplace=True)
print(casas_nuevas.corr().to_string())
| | age | distance_to_the_nearest_MRT_station | number_of_convenience_stores | lat | lon | price |
|---|---|---|---|---|---|---|
| age | 1.000000 | 0.273330 | 0.074861 | 0.138400 | -0.109839 | -0.326439 |
| distance_to_the_nearest_MRT_station | 0.273330 | 1.000000 | -0.260047 | 0.225971 | -0.108378 | -0.343903 |
| number_of_convenience_stores | 0.074861 | -0.260047 | 1.000000 | 0.056987 | 0.690952 | 0.167605 |
| lat | 0.138400 | 0.225971 | 0.056987 | 1.000000 | -0.078110 | 0.084663 |
| lon | -0.109839 | -0.108378 | 0.690952 | -0.078110 | 1.000000 | 0.204144 |
| price | -0.326439 | -0.343903 | 0.167605 | 0.084663 | 0.204144 | 1.000000 |
This is a super useful tool and one of the cores of statistics. The correlation tells us how strongly two variables are bound, and in which manner: a correlation of 1 means the variables have an increasing relation (when one increases, so does the other), -1 means the opposite, and 0 means their behaviours have no linear relation at all. (Strictly speaking, corr() computes the Pearson correlation, which is the covariance normalized by both standard deviations so that it always lies between -1 and 1.)
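As a minimal sketch of that definition (the helper below is our own, not part of the original analysis), we can recompute one entry of the matrix by hand:
def pearson(x, y):
    x, y = np.asarray(x, dtype=float), np.asarray(y, dtype=float)
    cov = ((x - x.mean()) * (y - y.mean())).mean()  # covariance
    return cov / (x.std() * y.std())                # normalized to [-1, 1]
print(pearson(selected_points['age'], selected_points['price']))  # ~ -0.33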
We will see these relations in a more visual way, using a matrix where both axes correspond to the variables and every intersection shows a scatter plot of the relation between the two.
import seaborn as sns
import matplotlib.pyplot as plt
sns.pairplot(selected_points, diag_kind='kde')
plt.show()
On the diagonal of the matrix, where a variable intersects itself, we get the distribution of that variable, i.e., the range of values where most of its data is located.
Where the points are widely scattered, the relation between the two variables is weaker; where the data tends to grow or decrease along the Y axis as we move along the X axis, the correlation tends to 1 or -1.
sns.histplot(selected_points['age'], bins=20)
plt.xlabel('House age')
plt.ylabel('Frequency')
plt.title('Distribution of house ages')
plt.show()
Here we get a basic view of the distribution of house age, which matches how the points of the age variable spread along the X axis in each of the scatter plots above.
sns.scatterplot(x='age', y='price', data=selected_points)
plt.xlabel('Age')
plt.ylabel('House price')
plt.title('Age vs. price')
plt.show()
Finally, we will see if we can do some forecasting of prices as age goes on, and find which cases weakened our correlation. Let's sort our dataframe by age. (The correlation between age and price, about -0.33, is an important factor: it means that as age grows, the price is likely to go down.)
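As a minimal forecasting sketch (a plain linear fit of our own; the original analysis does not fit a model), we can estimate how much the price drops per year of age:
from sklearn.linear_model import LinearRegression
X_age = selected_points[['age']].to_numpy()
y_price = selected_points['price'].to_numpy()
model = LinearRegression().fit(X_age, y_price)
print(f"price ~ {model.intercept_:.2f} + {model.coef_[0]:.2f} * age")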
from sklearn.cluster import DBSCAN
# Cluster the (age, price) pairs; points outside the dense main cluster are
# candidate outliers
X = np.array([[df.loc[i, 'age'], df.loc[i, 'price']] for i in df.index])
db = DBSCAN(eps=9, min_samples=10).fit(X)
labels = db.labels_
# Number of clusters in labels, ignoring noise if present
n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)
n_noise_ = list(labels).count(-1)
unique_labels = set(labels)
core_samples_mask = np.zeros_like(labels, dtype=bool)
core_samples_mask[db.core_sample_indices_] = True
colors = [plt.cm.Spectral(each) for each in np.linspace(0, 1, len(unique_labels))]
# Indices of every point outside the main cluster (label 0); with these
# parameters there is a single cluster, so these are the suspected outliers
indices_falsos = [i for i, label in enumerate(labels) if label != 0]
for k, col in zip(unique_labels, colors):
    if k == -1:
        # Black used for noise
        col = [0, 0, 0, 1]
    class_member_mask = labels == k
    # Core points of the cluster: large markers
    xy_ = X[class_member_mask & core_samples_mask]
    plt.plot(
        xy_[:, 0],
        xy_[:, 1],
        "o",
        markerfacecolor=tuple(col),
        markeredgecolor="k",
        markersize=14,
    )
    # Border points of the cluster and noise points: small markers
    xy_outliers = X[class_member_mask & ~core_samples_mask]
    print(xy_outliers)
    plt.plot(
        xy_outliers[:, 0],
        xy_outliers[:, 1],
        "o",
        markerfacecolor=tuple(col),
        markeredgecolor="k",
        markersize=6,
    )
plt.title(f"Number of clusters: {n_clusters_}")
plt.show()
[[35.9 61.5]
 [ 0.  69.7]
 [ 0.  70.1]
 [ 0.  71. ]]
[[ 41.3  60.7]
 [ 38.6  62.9]
 [ 41.4  63.3]
 [ 40.9  67.7]
 [  0.   73.6]
 [ 35.4  78. ]
 [ 37.2  78.3]
 [ 10.8 117.5]]
With a little bit of tuning, we have spotted some possible outliers.
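A common heuristic for that tuning (our own addition, not part of the original analysis) is the k-distance elbow: sort every point's distance to its min_samples-th nearest neighbor and look for the knee, which suggests a reasonable eps.
from sklearn.neighbors import NearestNeighbors
nn = NearestNeighbors(n_neighbors=10).fit(X)  # 10 matches min_samples above
dists, _ = nn.kneighbors(X)
plt.plot(np.sort(dists[:, -1]))
plt.ylabel('Distance to 10th nearest neighbor')
plt.show()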
outliers = df.loc[indices_falsos]
outliers
| | transaction_date | age | distance_to_the_nearest_MRT_station | number_of_convenience_stores | lat | lon | price | category | color | weights |
|---|---|---|---|---|---|---|---|---|---|---|
| 395 | 2013-01-01 | 41.3 | 124.9912 | 6 | 24.96674 | 121.54039 | 60.7 | Super Cara | darkred | 396 |
| 401 | 2013-01-01 | 38.6 | 804.6897 | 4 | 24.97838 | 121.53477 | 62.9 | Super Cara | darkred | 402 |
| 404 | 2013-01-01 | 41.4 | 281.2050 | 8 | 24.97345 | 121.54093 | 63.3 | Super Cara | darkred | 405 |
| 406 | 2013-01-01 | 40.9 | 122.3619 | 8 | 24.96756 | 121.54230 | 67.7 | Super Cara | darkred | 407 |
| 410 | 2013-05-01 | 0.0 | 292.9978 | 6 | 24.97744 | 121.54458 | 73.6 | Super Cara | darkred | 411 |
| 411 | 2013-06-01 | 35.4 | 318.5292 | 9 | 24.97071 | 121.54069 | 78.0 | Super Cara | darkred | 412 |
| 412 | 2013-03-01 | 37.2 | 186.5101 | 9 | 24.97703 | 121.54265 | 78.3 | Super Cara | darkred | 413 |
| 413 | 2013-03-01 | 10.8 | 252.5822 | 1 | 24.97460 | 121.53046 | 117.5 | Super Cara | darkred | 414 |
Here we have located the elements of our dataframe that may have skewed our correlation value. Let's drop these items and see whether we get a more balanced correlation that better measures the general nature of the data, leaving aside the special cases.
df.drop(axis=0, index=indices_falsos, inplace=True)
casas_nuevas.drop(axis=0, index=indices_falsos, inplace=True)
new_cov = df.copy()
new_cov.drop(columns=['transaction_date', 'category', 'color', 'weights'], axis=1, inplace=True)
print(new_cov.corr().to_string())
| | age | distance_to_the_nearest_MRT_station | number_of_convenience_stores | lat | lon | price |
|---|---|---|---|---|---|---|
| age | 1.000000 | 0.040930 | 0.021610 | 0.052362 | -0.060542 | -0.288220 |
| distance_to_the_nearest_MRT_station | 0.040930 | 1.000000 | -0.602641 | -0.591282 | -0.806179 | -0.708534 |
| number_of_convenience_stores | 0.021610 | -0.602641 | 1.000000 | 0.447611 | 0.445918 | 0.606093 |
| lat | 0.052362 | -0.591282 | 0.447611 | 1.000000 | 0.412476 | 0.582044 |
| lon | -0.060542 | -0.806179 | 0.445918 | 0.412476 | 1.000000 | 0.559409 |
| price | -0.288220 | -0.708534 | 0.606093 | 0.582044 | 0.559409 | 1.000000 |
print(casas_nuevas.corr().to_string())
| | age | distance_to_the_nearest_MRT_station | number_of_convenience_stores | lat | lon | price |
|---|---|---|---|---|---|---|
| age | 1.000000 | 0.303168 | 0.042480 | 0.167210 | -0.119470 | -0.526540 |
| distance_to_the_nearest_MRT_station | 0.303168 | 1.000000 | -0.253274 | 0.210262 | -0.095022 | -0.404114 |
| number_of_convenience_stores | 0.042480 | -0.253274 | 1.000000 | 0.070082 | 0.687840 | 0.223867 |
| lat | 0.167210 | 0.210262 | 0.070082 | 1.000000 | -0.074338 | 0.102738 |
| lon | -0.119470 | -0.095022 | 0.687840 | -0.074338 | 1.000000 | 0.336239 |
| price | -0.526540 | -0.404114 | 0.223867 | 0.102738 | 0.336239 | 1.000000 |
import plotly.graph_objects as go
muy_baratas.reset_index(drop=True, inplace=True)
muy_baratas['loc'] = muy_baratas.index.values + 1
muy_baratas.sort_values(by='price', axis=0, inplace=True, ascending=True)
print(muy_baratas)
dimensions = list([
    dict(range=(muy_baratas['loc'].max(), 1),
         tickvals=muy_baratas['loc'], ticktext=muy_baratas['category'],
         label='Category', values=muy_baratas['loc']),
    dict(range=(0, muy_baratas['price'].max()), label='Price', values=muy_baratas['price']),
    dict(range=(0, muy_baratas['number_of_convenience_stores'].max()),
         label='Convenience stores', values=muy_baratas['number_of_convenience_stores']),
    dict(range=(0, muy_baratas['distance_to_the_nearest_MRT_station'].max()),
         label='Distance to nearest MRT station',
         values=muy_baratas['distance_to_the_nearest_MRT_station']),
    dict(range=(0, muy_baratas['age'].max()), label='Age', values=muy_baratas['age']),
])
fig = go.Figure(data=go.Parcoords(line=dict(color=0, colorscale='agsunset'), dimensions=dimensions))
fig.update_layout(width=800, height=400, margin=dict(l=150, r=60, t=60, b=40))
fig.show("png")
| loc | category | number_of_convenience_stores | distance_to_the_nearest_MRT_station | price | age |
|---|---|---|---|---|---|
| 1 | Super Barata | 1.240964 | 2804.101812 | 19.802410 | 19.500000 |
| 2 | Barata | 2.879518 | 1354.403169 | 29.953012 | 17.818072 |
| 3 | Media | 4.771084 | 542.854812 | 38.563855 | 23.844578 |
| 4 | Cara | 5.548780 | 450.979636 | 44.628049 | 16.601220 |
| 5 | Super Cara | 6.048193 | 259.463642 | 57.033735 | 10.785542 |
Here we recalculated the correlations on the whole dataset and on the 1 km selection, dropping the outliers to get more precise information. The correlation between price and age downtown has gone from -0.33 to -0.53!
This Notebook exemplifies some common tools in data analysis and some core concepts of the statistics discipline.