usse/funda-scraper/funda_scraper/preprocess.py

195 lines
5.9 KiB
Python

"""Preprocess raw data scraped from Funda"""
from typing import Any
import pandas as pd
from funda_scraper.config.core import config
from datetime import datetime
from datetime import timedelta
def clean_price(x: str) -> int:
    """Extract the integer amount from a raw 'price' value.

    The value is converted to a string and split on single spaces; the
    second piece is taken as the amount and its commas are removed before
    conversion.

    :param x: raw price value
    :return: the amount as an integer, or 0 when there is no second piece
        or it is not a valid integer
    """
    pieces = str(x).split(" ")
    if len(pieces) < 2:
        # Nothing follows the first space-separated piece.
        return 0
    amount = pieces[1].replace(",", "")
    try:
        return int(amount)
    except ValueError:
        return 0
def clean_year(x: str) -> int:
    """Clean the 'year' and transform from string to integer.

    Handled forms:

    - a four-character value such as "1990", converted directly
    - a value containing "-" such as "1960-1970", where the part before the
      first "-" is used
    - a value containing "before" such as "before 1906", where the second
      space-separated piece is used

    :param x: raw year value
    :return: the parsed year, or 0 when the value matches none of the forms
        or the selected piece is not a valid integer
    """
    text = str(x)
    try:
        if len(text) == 4:
            return int(text)
        if "-" in text:
            return int(text.split("-")[0])
        if "before" in text:
            return int(text.split(" ")[1])
    except (ValueError, IndexError):
        # A value that looks like one of the forms but carries no number
        # (e.g. "none", "n-a", "before") is treated as unknown instead of
        # aborting the whole column conversion.
        return 0
    return 0
def clean_living_area(x: str) -> int:
    """Clean the 'living_area' and transform from string to integer.

    The value is converted to a string, commas are removed, and the first
    whitespace-separated piece is parsed as the area, so a trailing unit is
    ignored.

    :param x: raw living-area value
    :return: the area as an integer, or 0 when the value is empty or its
        first piece is not a valid integer
    """
    text = str(x).replace(",", "")
    try:
        # split() with no argument splits on any whitespace; an empty-string
        # separator is rejected by str.split with ValueError.
        return int(text.split()[0])
    except (ValueError, IndexError):
        return 0
def find_n_room(x: str) -> int:
    """Find the number of rooms from a string.

    The text before the first occurrence of "room" is parsed as the count,
    e.g. "5 rooms (3 bedrooms)" gives 5.

    :param x: raw room description
    :return: the number of rooms, or 0 when "room" does not occur or the
        text before it is not a valid integer
    """
    text = str(x)
    if "room" not in text:
        return 0
    try:
        return int(text.split("room")[0].strip())
    except ValueError:
        # E.g. "3 bedrooms": "room" occurs, but the prefix "3 bed" is not
        # a room count.
        return 0
def find_n_bedroom(x: str) -> int:
    """Find the number of bedrooms from a string.

    The count is read from the whitespace-separated piece immediately before
    the first piece that contains "bedroom", after removing any "(" from it.
    This covers "5 rooms (3 bedrooms)" as well as "2 bedrooms".

    :param x: raw room description
    :return: the number of bedrooms, or 0 when no bedroom count can be read
    """
    pieces = str(x).split()
    for position, piece in enumerate(pieces):
        if "bedroom" not in piece or position == 0:
            continue
        try:
            return int(pieces[position - 1].replace("(", ""))
        except ValueError:
            # The word before "bedroom" is not a number.
            return 0
    return 0
def find_n_bathroom(x: str) -> int:
    """Find the number of bathrooms from a string.

    The text before the first occurrence of "bathroom" is parsed as the
    count, e.g. "1 bathroom and 1 separate toilet" gives 1.

    :param x: raw bathroom description
    :return: the number of bathrooms, or 0 when "bathroom" does not occur or
        the text before it is not a valid integer
    """
    text = str(x)
    if "bathroom" not in text:
        return 0
    try:
        return int(text.split("bathroom")[0].strip())
    except ValueError:
        # "bathroom" occurs but is not preceded by a plain number.
        return 0
def map_dutch_month(x: str) -> str:
    """Replace Dutch month names in a string with their English names.

    The table covers januari, februari, maart, mei, juni, juli, augustus and
    oktober; any other text is returned unchanged. Matching is
    case-sensitive and replaces every occurrence.

    :param x: text that may contain a Dutch month name
    :return: the text with the listed month names replaced
    """
    dutch_to_english = (
        ("januari", "January"),
        ("februari", "February"),
        ("maart", "March"),
        ("mei", "May"),
        ("juni", "June"),
        ("juli", "July"),
        ("augustus", "August"),
        ("oktober", "October"),
    )
    translated = x
    for dutch, english in dutch_to_english:
        # replace() returns the string unchanged when there is no match.
        translated = translated.replace(dutch, english)
    return translated
def get_neighbor(x: str) -> str:
    """Return the part of the lowercased input that follows the city name.

    The city is the text before the first "/", with hyphens replaced by
    spaces. The input is lowercased and split on that city text, and the
    last piece is returned. When the city text does not occur in the
    lowercased input, the whole lowercased input is returned.

    NOTE(review): the expected input format is not visible here; confirm
    against the caller.

    :param x: raw location string
    :return: the trailing piece of the lowercased input
    """
    prefix, _, _ = x.partition("/")
    city = prefix.replace("-", " ")
    pieces = x.lower().split(city)
    return pieces[-1]
def clean_energy_label(x: str) -> str:
    """Clean the energy labels.

    Only the text before the first space is kept. Any label containing
    "A+" (e.g. "A++", "A+++") is collapsed into the single bucket ">A+".

    :param x: raw energy label
    :return: ">A+" for labels containing "A+", otherwise the first
        space-separated piece of the input
    """
    # split(" ") always yields at least one element, so indexing [0] cannot
    # fail and needs no exception handling.
    label = x.split(" ")[0]
    return ">A+" if "A+" in label else label
def clean_list_date(x: str) -> Any:
    """Transform the date from string to datetime object.

    Handled forms, checked in this order:

    - blank text, text containing "na", or "Indefinite duration": "na"
    - "<n> month(s)": now minus n * 30 days
    - "<n> week(s)": now minus n * 7 days
    - "Today": now minus 1 day
    - "<n> day(s)": now minus n days
    - anything else is parsed with the format "%B %d, %Y"

    A trailing "+" on the count (e.g. "6+ months") is ignored.

    :param x: raw listing-date text
    :return: a naive ``datetime`` based on the local clock, or the string
        "na" when the value is missing or cannot be parsed
    """

    def delta_now(d):
        """Return the current local time minus ``d`` days."""
        return datetime.now() - timedelta(days=d)

    def leading_count(text, unit):
        """Parse the whole integer that precedes ``unit`` in ``text``."""
        return int(text.split(unit)[0].strip().rstrip("+"))

    try:
        if not x.strip() or "na" in x or "Indefinite duration" in x:
            return "na"
        if "month" in x:
            return delta_now(leading_count(x, "month") * 30)
        if "week" in x:
            return delta_now(leading_count(x, "week") * 7)
        if "Today" in x:
            # Must be tested before "day", which "Today" also contains.
            # NOTE(review): the one-day offset is kept from the original;
            # confirm that 0 is not intended.
            return delta_now(1)
        if "day" in x:
            return delta_now(leading_count(x, "day"))
        return datetime.strptime(x, "%B %d, %Y")
    except ValueError:
        # Raised by int() or strptime() for text that fits none of the forms.
        return "na"
def preprocess_data(df: pd.DataFrame, is_past: bool) -> pd.DataFrame:
    """
    Clean the raw dataframe from scraping.
    Indicate whether it includes historical data since the columns would be different.

    Rows with missing values, a house type other than "appartement"/"huis",
    an unparsable price or living area, or an unusable listing date are
    dropped. The input dataframe is not modified.

    :param df: raw dataframe from scraping
    :param is_past: whether it scraped past data
    :return: clean dataframe restricted to the configured columns, with a
        fresh 0-based index
    """
    # Each filtered frame is copied before new columns are assigned, so the
    # assignments never target a slice of another frame.
    df = df.dropna().copy()
    keep_cols = config.keep_cols.selling_data
    keep_cols_sold = keep_cols + config.keep_cols.sold_data

    # Info: the second-to-last URL segment is read as "<type>-<id>-...".
    df["house_id"] = df["url"].apply(lambda x: int(x.split("/")[-2].split("-")[1]))
    df["house_type"] = df["url"].apply(lambda x: x.split("/")[-2].split("-")[0])
    df = df[df["house_type"].isin(["appartement", "huis"])].copy()

    # Price: the cleaners return 0 for values they cannot parse.
    price_col = "price_sold" if is_past else "price"
    df["price"] = df[price_col].apply(clean_price)
    df = df[df["price"] != 0].copy()
    df["living_area"] = df["living_area"].apply(clean_living_area)
    df = df[df["living_area"] != 0].copy()
    df["price_m2"] = round(df.price / df.living_area, 1)

    # Location: keep the first four characters of the zip code.
    df["zip"] = df["zip_code"].apply(lambda x: x[:4])

    # House layout
    df["room"] = df["num_of_rooms"].apply(find_n_room)
    df["bedroom"] = df["num_of_rooms"].apply(find_n_bedroom)
    df["bathroom"] = df["num_of_bathrooms"].apply(find_n_bathroom)
    df["energy_label"] = df["energy_label"].apply(clean_energy_label)
    df["has_balcony"] = df["exteriors"].apply(
        lambda x: 1 if str(x).find("Balcony present") != -1 else 0
    )
    df["has_garden"] = df["exteriors"].apply(
        lambda x: 1 if str(x).find("garden") != -1 else 0
    )

    # Time: age is measured against the current year, not a fixed one.
    df["year_built"] = df["year"].apply(clean_year).astype(int)
    df["house_age"] = datetime.now().year - df["year_built"]

    if not is_past:
        # Only check current data
        df["date_list"] = df.listed_since.apply(clean_list_date)
        df = df[df["date_list"] != "na"].copy()
        df["date_list"] = pd.to_datetime(df["date_list"])
    else:
        # Only check past data
        df = df[(df["date_sold"] != "na") & (df["date_list"] != "na")].copy()
        df["date_sold"] = df["date_sold"].apply(map_dutch_month)
        df = df.dropna().copy()
        df["date_list"] = pd.to_datetime(df["date_list"])
        df["date_sold"] = pd.to_datetime(df["date_sold"])
        df["ym_sold"] = df["date_sold"].apply(lambda x: x.to_period("M").to_timestamp())
        df["year_sold"] = df["date_sold"].apply(lambda x: x.year)

        # Term: whole days between listing and sale.
        df["term_days"] = df["date_sold"] - df["date_list"]
        df["term_days"] = df["term_days"].apply(lambda x: x.days)
        keep_cols = keep_cols_sold

    df["ym_list"] = df["date_list"].apply(lambda x: x.to_period("M").to_timestamp())
    df["year_list"] = df["date_list"].apply(lambda x: x.year)

    return df[keep_cols].reset_index(drop=True)