Added perceel_area and price_m2_total columns

This commit is contained in:
Eljakim Herrewijnen 2024-04-02 22:20:30 +02:00
parent 1c3376bf7b
commit 587f6fc950
8 changed files with 12 additions and 1 deletion

View File

@ -14,10 +14,12 @@ keep_cols:
- building_type - building_type
- price - price
- price_m2 - price_m2
- price_m2_total
- room - room
- bedroom - bedroom
- bathroom - bathroom
- living_area - living_area
- perceel_area
- energy_label - energy_label
- zip - zip
- address - address
@ -38,6 +40,7 @@ css_selector:
size: ".fd-m-right-xl--bp-m .fd-text--nowrap" size: ".fd-m-right-xl--bp-m .fd-text--nowrap"
year: ".fd-align-items-center~ .fd-align-items-center .fd-m-right-xs" year: ".fd-align-items-center~ .fd-align-items-center .fd-m-right-xs"
living_area: ".object-kenmerken-list:nth-child(8) .fd-align-items-center:nth-child(2) span" living_area: ".object-kenmerken-list:nth-child(8) .fd-align-items-center:nth-child(2) span"
perceel_area: ".object-kenmerken-list:nth-child(8) .fd-align-items-center:nth-child(5) span"
kind_of_house: ".object-kenmerken-list:nth-child(5) .fd-align-items-center:nth-child(2) span" kind_of_house: ".object-kenmerken-list:nth-child(5) .fd-align-items-center:nth-child(2) span"
building_type: ".object-kenmerken-list:nth-child(5) .fd-align-items-center:nth-child(4) span" building_type: ".object-kenmerken-list:nth-child(5) .fd-align-items-center:nth-child(4) span"
num_of_rooms: ".object-kenmerken-list:nth-child(11) .fd-align-items-center:nth-child(2)" num_of_rooms: ".object-kenmerken-list:nth-child(11) .fd-align-items-center:nth-child(2)"

View File

@ -174,8 +174,12 @@ def preprocess_data(df: pd.DataFrame, is_past: bool) -> pd.DataFrame:
df["price"] = df[price_col].apply(clean_price) df["price"] = df[price_col].apply(clean_price)
df = df[df["price"] != 0] df = df[df["price"] != 0]
df["living_area"] = df["living_area"].apply(clean_living_area) df["living_area"] = df["living_area"].apply(clean_living_area)
df["perceel_area"] = df["perceel_area"].apply(clean_living_area)
df = df[df["living_area"] != 0] df = df[df["living_area"] != 0]
df = df[df["perceel_area"] != 0]
df["price_m2"] = round(df.price / df.living_area, 1) df["price_m2"] = round(df.price / df.living_area, 1)
df["price_m2_total"] = round(df.price / (df.living_area + df.perceel_area), 1)
# Location # Location
df["zip"] = df["zip_code"].apply(lambda x: x[:4]) df["zip"] = df["zip_code"].apply(lambda x: x[:4])

View File

@ -198,6 +198,7 @@ class FundaScraper(object):
self.get_value_from_css(soup, self.selectors.size), self.get_value_from_css(soup, self.selectors.size),
self.get_value_from_css(soup, self.selectors.year), self.get_value_from_css(soup, self.selectors.year),
self.get_value_from_css(soup, self.selectors.living_area), self.get_value_from_css(soup, self.selectors.living_area),
self.get_value_from_css(soup, self.selectors.perceel_area),
self.get_value_from_css(soup, self.selectors.kind_of_house), self.get_value_from_css(soup, self.selectors.kind_of_house),
self.get_value_from_css(soup, self.selectors.building_type), self.get_value_from_css(soup, self.selectors.building_type),
self.get_value_from_css(soup, self.selectors.num_of_rooms), self.get_value_from_css(soup, self.selectors.num_of_rooms),

View File

@ -16,6 +16,7 @@ def input_data():
"size": ["100 m²"], "size": ["100 m²"],
"year": ["2000"], "year": ["2000"],
"living_area": ["78 m²"], "living_area": ["78 m²"],
"perceel_area": ["78 m²"],
"kind_of_house": ["Eengezinswoning"], "kind_of_house": ["Eengezinswoning"],
"building_type": ["Bestaande bouw"], "building_type": ["Bestaande bouw"],
"num_of_rooms": ["4 kamers (3 slaapkamers)"], "num_of_rooms": ["4 kamers (3 slaapkamers)"],

Binary file not shown.

View File

@ -24,9 +24,10 @@ out = []
# URL = "https://www.funda.nl/zoeken/koop?selected_area=%5B%22utrecht,15km%22%5D&price=%22-400000%22&object_type=%5B%22house%22%5D" # URL = "https://www.funda.nl/zoeken/koop?selected_area=%5B%22utrecht,15km%22%5D&price=%22-400000%22&object_type=%5B%22house%22%5D"
URL = "https://www.funda.nl/zoeken/koop?selected_area=%5B%22utrecht,30km%22%5D&price=%22-500000%22&object_type=%5B%22house%22%5D" URL = "https://www.funda.nl/zoeken/koop?selected_area=%5B%22utrecht,30km%22%5D&price=%22-500000%22&object_type=%5B%22house%22%5D"
NUM_PAGES = 1
def get_funda_data(): def get_funda_data():
# scraper = FundaScraper(url="nijkerk/beschikbaar/100000-400000/woonhuis/tuin/eengezinswoning/landhuis/+30km/", find_past=False, n_pages=81) # scraper = FundaScraper(url="nijkerk/beschikbaar/100000-400000/woonhuis/tuin/eengezinswoning/landhuis/+30km/", find_past=False, n_pages=81)
scraper = FundaScraper(url=URL, find_past=False, n_pages=150) scraper = FundaScraper(url=URL, find_past=False, n_pages=NUM_PAGES)
df = scraper.run() df = scraper.run()
return df return df
@ -36,6 +37,7 @@ def get_distances(out_dict, destination_location):
out_dict[key] = distance['routes'][0] out_dict[key] = distance['routes'][0]
def generate_json(houses): def generate_json(houses):
print("Gecoding distances...")
count = 0 count = 0
for i in tqdm.tqdm(range(len(houses))): for i in tqdm.tqdm(range(len(houses))):
count += 1 count += 1