Add price_m2 and price_m2_total columns; scrape and clean perceel_area
This commit is contained in:
parent
1c3376bf7b
commit
587f6fc950
Binary file not shown.
Binary file not shown.
@ -14,10 +14,12 @@ keep_cols:
|
||||
- building_type
|
||||
- price
|
||||
- price_m2
|
||||
- price_m2_total
|
||||
- room
|
||||
- bedroom
|
||||
- bathroom
|
||||
- living_area
|
||||
- perceel_area
|
||||
- energy_label
|
||||
- zip
|
||||
- address
|
||||
@ -38,6 +40,7 @@ css_selector:
|
||||
size: ".fd-m-right-xl--bp-m .fd-text--nowrap"
|
||||
year: ".fd-align-items-center~ .fd-align-items-center .fd-m-right-xs"
|
||||
living_area: ".object-kenmerken-list:nth-child(8) .fd-align-items-center:nth-child(2) span"
|
||||
perceel_area: ".object-kenmerken-list:nth-child(8) .fd-align-items-center:nth-child(5) span"
|
||||
kind_of_house: ".object-kenmerken-list:nth-child(5) .fd-align-items-center:nth-child(2) span"
|
||||
building_type: ".object-kenmerken-list:nth-child(5) .fd-align-items-center:nth-child(4) span"
|
||||
num_of_rooms: ".object-kenmerken-list:nth-child(11) .fd-align-items-center:nth-child(2)"
|
||||
|
@ -174,8 +174,12 @@ def preprocess_data(df: pd.DataFrame, is_past: bool) -> pd.DataFrame:
|
||||
df["price"] = df[price_col].apply(clean_price)
|
||||
df = df[df["price"] != 0]
|
||||
df["living_area"] = df["living_area"].apply(clean_living_area)
|
||||
df["perceel_area"] = df["perceel_area"].apply(clean_living_area)
|
||||
df = df[df["living_area"] != 0]
|
||||
df = df[df["perceel_area"] != 0]
|
||||
|
||||
df["price_m2"] = round(df.price / df.living_area, 1)
|
||||
df["price_m2_total"] = round(df.price / (df.living_area + df.perceel_area), 1)
|
||||
|
||||
# Location
|
||||
df["zip"] = df["zip_code"].apply(lambda x: x[:4])
|
||||
|
@ -198,6 +198,7 @@ class FundaScraper(object):
|
||||
self.get_value_from_css(soup, self.selectors.size),
|
||||
self.get_value_from_css(soup, self.selectors.year),
|
||||
self.get_value_from_css(soup, self.selectors.living_area),
|
||||
self.get_value_from_css(soup, self.selectors.perceel_area),
|
||||
self.get_value_from_css(soup, self.selectors.kind_of_house),
|
||||
self.get_value_from_css(soup, self.selectors.building_type),
|
||||
self.get_value_from_css(soup, self.selectors.num_of_rooms),
|
||||
|
@ -16,6 +16,7 @@ def input_data():
|
||||
"size": ["100 m²"],
|
||||
"year": ["2000"],
|
||||
"living_area": ["78 m²"],
|
||||
"perceel_area": ["78 m²"],
|
||||
"kind_of_house": ["Eengezinswoning"],
|
||||
"building_type": ["Bestaande bouw"],
|
||||
"num_of_rooms": ["4 kamers (3 slaapkamers)"],
|
||||
|
Binary file not shown.
@ -24,9 +24,10 @@ out = []
|
||||
|
||||
# URL = "https://www.funda.nl/zoeken/koop?selected_area=%5B%22utrecht,15km%22%5D&price=%22-400000%22&object_type=%5B%22house%22%5D"
|
||||
URL = "https://www.funda.nl/zoeken/koop?selected_area=%5B%22utrecht,30km%22%5D&price=%22-500000%22&object_type=%5B%22house%22%5D"
|
||||
NUM_PAGES = 1
|
||||
def get_funda_data():
|
||||
# scraper = FundaScraper(url="nijkerk/beschikbaar/100000-400000/woonhuis/tuin/eengezinswoning/landhuis/+30km/", find_past=False, n_pages=81)
|
||||
scraper = FundaScraper(url=URL, find_past=False, n_pages=150)
|
||||
scraper = FundaScraper(url=URL, find_past=False, n_pages=NUM_PAGES)
|
||||
df = scraper.run()
|
||||
return df
|
||||
|
||||
@ -36,6 +37,7 @@ def get_distances(out_dict, destination_location):
|
||||
out_dict[key] = distance['routes'][0]
|
||||
|
||||
def generate_json(houses):
|
||||
print("Gecoding distances...")
|
||||
count = 0
|
||||
for i in tqdm.tqdm(range(len(houses))):
|
||||
count += 1
|
||||
|
Loading…
Reference in New Issue
Block a user