usse/scrape/funda-scraper/funda_scraper/preprocess.py

"""Preprocess raw data scraped from Funda"""
import re
from datetime import datetime, timedelta
from typing import Union

import pandas as pd
from dateutil.parser import parse

from funda_scraper.config.core import config


def clean_price(x: str) -> int:
    """Parse a scraped price string into an integer amount.

    The value is stringified and split on single spaces; the second token
    is taken as the amount and every "." in it is dropped before the
    integer conversion. Returns 0 when there is no second token or when
    that token is not a valid integer.
    """
    tokens = str(x).split(" ")
    try:
        amount = tokens[1].replace(".", "")
        return int(amount)
    except (IndexError, ValueError):
        return 0


def clean_year(x: str) -> int:
    """Clean the 'year' and transform from string to integer.

    Recognised shapes, checked in this order:

    * a four-character value, converted as-is ("1990" -> 1990);
    * a value containing "-", where the part before the first "-" is used
      ("1906-1930" -> 1906);
    * a value containing "before", where the second space-separated token
      is used ("before 1906" -> 1906).

    Anything else, including a recognised shape whose numeric part cannot
    be converted, yields 0. This mirrors how the other cleaners in this
    module report unparseable input instead of raising.
    """
    # Stringify first (like clean_price / clean_living_area) so numeric
    # input does not fail on len().
    text = str(x).strip()
    try:
        if len(text) == 4:
            return int(text)
        if "-" in text:
            return int(text.split("-")[0])
        if "before" in text:
            return int(text.split(" ")[1])
    except (ValueError, IndexError):
        # Non-numeric year part, or "before" with nothing after it.
        pass
    return 0


def clean_living_area(x: str) -> int:
    """Parse a scraped living-area string into an integer.

    Commas are removed from the stringified value, then everything before
    the first " m²" is converted to an integer. Returns 0 when that
    leading part is not a valid integer.
    """
    without_commas = str(x).replace(",", "")
    number_part = without_commas.split(" m²")[0]
    try:
        return int(number_part)
    except ValueError:
        return 0


def find_keyword_from_regex(x: str, pattern: str) -> int:
    """Return the leading number of the first regex match in a string.

    All matches of ``pattern`` in ``x`` are collected with ``re.findall``.
    The first one is flattened into a single string (for patterns with
    several groups, the groups are concatenated) and the text before its
    first space is converted to an integer. Returns 0 when nothing matches.
    """
    matches = re.findall(pattern, x)
    if not matches:
        return 0
    first_match = "".join(matches[0])
    leading_token = first_match.split(" ")[0]
    return int(leading_token)


def find_n_room(x: str) -> int:
    """Extract the room count from a layout description.

    Searches for one or two digits, a single whitespace character, and
    then "kamer(s)" or "room(s)"; the digits of the first hit are returned
    via ``find_keyword_from_regex``.
    """
    return find_keyword_from_regex(
        x, r"(\d{1,2}\s{1}kamers{0,1})|(\d{1,2}\s{1}rooms{0,1})"
    )


def find_n_bedroom(x: str) -> int:
    """Extract the bedroom count from a layout description.

    Searches for one or two digits, a single whitespace character, and
    then "slaapkamer(s)" or "bedroom(s)"; the digits of the first hit are
    returned via ``find_keyword_from_regex``.
    """
    return find_keyword_from_regex(
        x, r"(\d{1,2}\s{1}slaapkamers{0,1})|(\d{1,2}\s{1}bedrooms{0,1})"
    )


def find_n_bathroom(x: str) -> int:
    """Extract the bathroom count from a layout description.

    Searches for one or two digits, a single whitespace character, and
    then "badkamer(s)" or "bathroom(s)"; the digits of the first hit are
    returned via ``find_keyword_from_regex``.
    """
    return find_keyword_from_regex(
        x, r"(\d{1,2}\s{1}badkamers{0,1})|(\d{1,2}\s{1}bathrooms{0,1})"
    )


def map_dutch_month(x: str) -> str:
    """Replace Dutch month names in a string with their English names.

    Every occurrence of each listed (lowercase) Dutch name is substituted;
    text that contains none of them is returned unchanged.
    """
    dutch_to_english = (
        ("januari", "January"),
        ("februari", "February"),
        ("maart", "March"),
        ("mei", "May"),
        ("juni", "June"),
        ("juli", "July"),
        ("augustus", "August"),
        ("oktober", "October"),
    )
    # str.replace is a no-op when the name is absent, so no membership
    # check is needed before substituting.
    for dutch, english in dutch_to_english:
        x = x.replace(dutch, english)
    return x


def get_neighbor(x: str) -> str:
    """Return the text that follows the last mention of the city name.

    The city is whatever precedes the first "/" in ``x``, with "-" turned
    into spaces. ``x`` is then lowercased and the part after the last
    occurrence of that city string is returned; when the city string does
    not occur in the lowercased text, the whole lowercased text is returned.
    """
    city_part, _, _ = x.partition("/")
    city = city_part.replace("-", " ")
    lowered = x.lower()
    # NOTE(review): the city string itself is not lowercased, so it only
    # matches when the prefix before "/" is already lowercase.
    return lowered.rpartition(city)[2]


def clean_energy_label(x: str) -> str:
    """Normalise a scraped energy label.

    Only the text before the first space is kept. Any label containing
    "A+" (for example "A+", "A++", "A+++") is collapsed to ">A+"; every
    other label is returned as-is.
    """
    label = x.partition(" ")[0]
    return ">A+" if "A+" in label else label


def clean_list_date(x: str) -> Union[datetime, str]:
    """Transform the date from string to datetime object.

    The text is first normalised ("weken" -> "week", "maanden" -> "month",
    "Vandaag" -> "Today", "+" removed, Dutch month names translated) and
    then interpreted, in this order, as:

    * a Dutch weekday name: the most recent such weekday, today included;
    * "<n> month...": n * 30 days before now;
    * "<n> week...": n * 7 days before now;
    * "Today": one day before now;
    * "<n> day...": n days before now;
    * otherwise an absolute date in "%d %B %Y" format.

    :param x: raw date text from the listing
    :return: the resolved datetime, or the string "na" when the text
        cannot be interpreted
    """

    x = x.replace("weken", "week")
    x = x.replace("maanden", "month")
    x = x.replace("Vandaag", "Today")
    x = x.replace("+", "")
    x = map_dutch_month(x)

    def delta_now(d: int) -> datetime:
        return datetime.now() - timedelta(days=d)

    # Dutch weekday name -> datetime.weekday() index (Monday is 0).
    weekday_index = {
        "maandag": 0,
        "dinsdag": 1,
        "woensdag": 2,
        "donderdag": 3,
        "vrijdag": 4,
        "zaterdag": 5,
        "zondag": 6,
    }

    try:
        weekday_key = x.lower()
        if weekday_key in weekday_index:
            # Modulo 7 keeps the offset in 0..6, so a weekday later in the
            # week than today resolves to last week instead of the future.
            # NOTE(review): today's own weekday name resolves to today (0
            # days ago), as before — confirm it should not mean 7 days ago.
            days_ago = (datetime.now().weekday() - weekday_index[weekday_key]) % 7
            return delta_now(days_ago)
        if "month" in x:
            # Convert the whole number, not just its first digit, so
            # multi-digit counts are honoured; an empty prefix raises
            # ValueError and is reported as "na" below.
            return delta_now(int(x.split("month")[0].strip()) * 30)
        if "week" in x:
            return delta_now(int(x.split("week")[0].strip()) * 7)
        if "Today" in x:
            # NOTE(review): "Today" maps to one day ago, unchanged from the
            # previous implementation — confirm this offset is intended.
            return delta_now(1)
        if "day" in x:
            return delta_now(int(x.split("day")[0].strip()))
        return datetime.strptime(x, "%d %B %Y")
    except ValueError:
        return "na"


def preprocess_data(df: pd.DataFrame, is_past: bool) -> pd.DataFrame:
    """
    Clean the raw dataframe from scraping.
    Indicate whether the historical data is included since the columns would be different.

    Rows with missing values, unsupported house types, or an unparseable
    price or living area are dropped. The input frame is not modified.
    Date handling is currently disabled (see the commented block below), so
    ``is_past`` only selects which raw price column is read.

    :param df: raw dataframe from scraping
    :param is_past: whether it scraped past data
    :return: clean dataframe restricted to the configured selling columns,
        with a fresh index
    """

    # Every filter below is followed by .copy(): assigning columns on the
    # result of dropna()/boolean indexing otherwise triggers pandas'
    # SettingWithCopyWarning.
    df = df.dropna().copy()
    keep_cols = config.keep_cols.selling_data
    # Only referenced by the disabled date handling further down.
    keep_cols_sold = keep_cols + config.keep_cols.sold_data

    # Info
    # The second-to-last URL segment is read as "<house type>-<id>-...".
    url_slug = df["url"].apply(lambda url: url.split("/")[-2])
    df["house_id"] = url_slug.apply(lambda slug: int(slug.split("-")[1]))
    df["house_type"] = url_slug.apply(lambda slug: slug.split("-")[0])
    df = df[df["house_type"].isin(["appartement", "huis"])].copy()

    # Price
    price_col = "price_sold" if is_past else "price"
    df["price"] = df[price_col].apply(clean_price)
    # The cleaners return 0 for values they cannot parse; drop those rows
    # (this also avoids dividing by a zero living area).
    df = df[df["price"] != 0].copy()
    df["living_area"] = df["living_area"].apply(clean_living_area)
    df = df[df["living_area"] != 0].copy()
    df["price_m2"] = (df["price"] / df["living_area"]).round(1)

    # Location
    df["zip"] = df["zip_code"].apply(lambda code: code[:4])

    # House layout
    df["room"] = df["num_of_rooms"].apply(find_n_room)
    df["bedroom"] = df["num_of_rooms"].apply(find_n_bedroom)
    df["bathroom"] = df["num_of_bathrooms"].apply(find_n_bathroom)
    df["energy_label"] = df["energy_label"].apply(clean_energy_label)

    # Time
    df["year_built"] = df["year"].apply(clean_year).astype(int)
    df["house_age"] = datetime.now().year - df["year_built"]

    # if is_past:
    #     # Only check past data
    #     df = df[(df["date_sold"] != "na") & (df["date_list"] != "na")]
    #     df["date_list"] = df["date_list"].apply(clean_list_date)
    #     df["date_sold"] = df["date_sold"].apply(clean_list_date)
    #     df = df.dropna()
    #     df["date_list"] = pd.to_datetime(df["date_list"])
    #     df["date_sold"] = pd.to_datetime(df["date_sold"])
    #     df["ym_sold"] = df["date_sold"].apply(lambda x: x.to_period("M").to_timestamp())
    #     df["year_sold"] = df["date_sold"].apply(lambda x: x.year)
    #
    #     # Term
    #     df["term_days"] = df["date_sold"] - df["date_list"]
    #     df["term_days"] = df["term_days"].apply(lambda x: x.days)
    #     keep_cols = keep_cols_sold
    #     df["date_sold"] = df["date_sold"].dt.date
    #
    # else:
    #     # Only check current data
    #     df["date_list"] = df["listed_since"].apply(clean_list_date)
    #     df = df[df["date_list"] != "na"]
    #     df["date_list"] = pd.to_datetime(df["date_list"])

    # df["ym_list"] = df["date_list"].apply(lambda x: x.to_period("M").to_timestamp())
    # df["year_list"] = df["date_list"].apply(lambda x: x.year)
    # df["date_list"] = df["date_list"].dt.date

    return df[keep_cols].reset_index(drop=True)
update 2023-12-22 14:26:01 +00:00			`"""Preprocess raw data scraped from Funda"""`
			`import re`
			`from datetime import datetime, timedelta`
			`from typing import Union`

			`import pandas as pd`
			`from dateutil.parser import parse`

			`from funda_scraper.config.core import config`


			`def clean_price(x: str) -> int:`
			`"""Clean the 'price' and transform from string to integer."""`
			`try:`
			`return int(str(x).split(" ")[1].replace(".", ""))`
			`except ValueError:`
			`return 0`
			`except IndexError:`
			`return 0`


			`def clean_year(x: str) -> int:`
			`"""Clean the 'year' and transform from string to integer"""`
			`if len(x) == 4:`
			`return int(x)`
			`elif x.find("-") != -1:`
			`return int(x.split("-")[0])`
			`elif x.find("before") != -1:`
			`return int(x.split(" ")[1])`
			`else:`
			`return 0`


			`def clean_living_area(x: str) -> int:`
			`"""Clean the 'living_area' and transform from string to integer"""`
			`try:`
			`return int(str(x).replace(",", "").split(" m²")[0])`
			`except ValueError:`
			`return 0`
			`except IndexError:`
			`return 0`


			`def find_keyword_from_regex(x: str, pattern: str) -> int:`
			`result = re.findall(pattern, x)`
			`if len(result) > 0:`
			`result = "".join(result[0])`
			`x = result.split(" ")[0]`
			`else:`
			`x = 0`
			`return int(x)`


			`def find_n_room(x: str) -> int:`
			`"""Find the number of rooms from a string"""`
			`pattern = r"(\d{1,2}\s{1}kamers{0,1})\|(\d{1,2}\s{1}rooms{0,1})"`
			`return find_keyword_from_regex(x, pattern)`


			`def find_n_bedroom(x: str) -> int:`
			`"""Find the number of bedrooms from a string"""`
			`pattern = r"(\d{1,2}\s{1}slaapkamers{0,1})\|(\d{1,2}\s{1}bedrooms{0,1})"`
			`return find_keyword_from_regex(x, pattern)`


			`def find_n_bathroom(x: str) -> int:`
			`"""Find the number of bathrooms from a string"""`
			`pattern = r"(\d{1,2}\s{1}badkamers{0,1})\|(\d{1,2}\s{1}bathrooms{0,1})"`
			`return find_keyword_from_regex(x, pattern)`


			`def map_dutch_month(x: str) -> str:`
			`"""Map the month from Dutch to English."""`
			`month_mapping = {`
			`"januari": "January",`
			`"februari": "February",`
			`"maart": "March",`
			`"mei": "May",`
			`"juni": "June",`
			`"juli": "July",`
			`"augustus": "August",`
			`"oktober": "October",`
			`}`
			`for k, v in month_mapping.items():`
			`if x.find(k) != -1:`
			`x = x.replace(k, v)`
			`return x`


			`def get_neighbor(x: str) -> str:`
			`"""Find the neighborhood name."""`
			`city = x.split("/")[0].replace("-", " ")`
			`return x.lower().split(city)[-1]`


			`def clean_energy_label(x: str) -> str:`
			`"""Clean the energy labels."""`
			`try:`
			`x = x.split(" ")[0]`
			`if x.find("A+") != -1:`
			`x = ">A+"`
			`return x`
			`except IndexError:`
			`return x`


			`def clean_list_date(x: str) -> Union[datetime, str]:`
			`"""Transform the date from string to datetime object."""`

			`x = x.replace("weken", "week")`
			`x = x.replace("maanden", "month")`
			`x = x.replace("Vandaag", "Today")`
			`x = x.replace("+", "")`
			`x = map_dutch_month(x)`

			`def delta_now(d: int):`
			`t = timedelta(days=d)`
			`return datetime.now() - t`

			`weekdays_dict = {`
			`"maandag": "Monday",`
			`"dinsdag": "Tuesday",`
			`"woensdag": "Wednesday",`
			`"donderdag": "Thursday",`
			`"vrijdag": "Friday",`
			`"zaterdag": "Saturday",`
			`"zondag": "Sunday",`
			`}`

			`try:`
			`if x.lower() in weekdays_dict.keys():`
			`date_string = weekdays_dict.get(x.lower())`
			`parsed_date = parse(date_string, fuzzy=True)`
			`delta = datetime.now().weekday() - parsed_date.weekday()`
			`x = delta_now(delta)`

			`elif x.find("month") != -1:`
			`x = delta_now(int(x.split("month")[0].strip()[0]) * 30)`
			`elif x.find("week") != -1:`
			`x = delta_now(int(x.split("week")[0].strip()[0]) * 7)`
			`elif x.find("Today") != -1:`
			`x = delta_now(1)`
			`elif x.find("day") != -1:`
			`x = delta_now(int(x.split("day")[0].strip()))`
			`else:`
			`x = datetime.strptime(x, "%d %B %Y")`
			`return x`

			`except ValueError:`
			`return "na"`


			`def preprocess_data(df: pd.DataFrame, is_past: bool) -> pd.DataFrame:`
			`"""`
			`Clean the raw dataframe from scraping.`
			`Indicate whether the historical data is included since the columns would be different.`

			`:param df: raw dataframe from scraping`
			`:param is_past: whether it scraped past data`
			`:return: clean dataframe`
			`"""`

			`df = df.dropna()`
			`keep_cols = config.keep_cols.selling_data`
			`keep_cols_sold = keep_cols + config.keep_cols.sold_data`

			`# Info`
			`df["house_id"] = df["url"].apply(lambda x: int(x.split("/")[-2].split("-")[1]))`
			`df["house_type"] = df["url"].apply(lambda x: x.split("/")[-2].split("-")[0])`
			`df = df[df["house_type"].isin(["appartement", "huis"])]`

			`# Price`
			`price_col = "price_sold" if is_past else "price"`
			`df["price"] = df[price_col].apply(clean_price)`
			`df = df[df["price"] != 0]`
			`df["living_area"] = df["living_area"].apply(clean_living_area)`
			`df = df[df["living_area"] != 0]`
			`df["price_m2"] = round(df.price / df.living_area, 1)`

			`# Location`
			`df["zip"] = df["zip_code"].apply(lambda x: x[:4])`

			`# House layout`
			`df["room"] = df["num_of_rooms"].apply(find_n_room)`
			`df["bedroom"] = df["num_of_rooms"].apply(find_n_bedroom)`
			`df["bathroom"] = df["num_of_bathrooms"].apply(find_n_bathroom)`
			`df["energy_label"] = df["energy_label"].apply(clean_energy_label)`

			`# Time`
			`df["year_built"] = df["year"].apply(clean_year).astype(int)`
			`df["house_age"] = datetime.now().year - df["year_built"]`

			`# if is_past:`
			`# # Only check past data`
			`# df = df[(df["date_sold"] != "na") & (df["date_list"] != "na")]`
			`# df["date_list"] = df["date_list"].apply(clean_list_date)`
			`# df["date_sold"] = df["date_sold"].apply(clean_list_date)`
			`# df = df.dropna()`
			`# df["date_list"] = pd.to_datetime(df["date_list"])`
			`# df["date_sold"] = pd.to_datetime(df["date_sold"])`
			`# df["ym_sold"] = df["date_sold"].apply(lambda x: x.to_period("M").to_timestamp())`
			`# df["year_sold"] = df["date_sold"].apply(lambda x: x.year)`
			`#`
			`# # Term`
			`# df["term_days"] = df["date_sold"] - df["date_list"]`
			`# df["term_days"] = df["term_days"].apply(lambda x: x.days)`
			`# keep_cols = keep_cols_sold`
			`# df["date_sold"] = df["date_sold"].dt.date`
			`#`
			`# else:`
			`# # Only check current data`
			`# df["date_list"] = df["listed_since"].apply(clean_list_date)`
			`# df = df[df["date_list"] != "na"]`
			`# df["date_list"] = pd.to_datetime(df["date_list"])`

			`# df["ym_list"] = df["date_list"].apply(lambda x: x.to_period("M").to_timestamp())`
			`# df["year_list"] = df["date_list"].apply(lambda x: x.year)`
			`# df["date_list"] = df["date_list"].dt.date`

			`return df[keep_cols].reset_index(drop=True)`