Add perceel_area (plot area) and price_m2_total columns (pricem2 stuff)
This commit is contained in:
parent
1c3376bf7b
commit
587f6fc950
Binary file not shown.
Binary file not shown.
@ -14,10 +14,12 @@ keep_cols:
|
|||||||
- building_type
|
- building_type
|
||||||
- price
|
- price
|
||||||
- price_m2
|
- price_m2
|
||||||
|
- price_m2_total
|
||||||
- room
|
- room
|
||||||
- bedroom
|
- bedroom
|
||||||
- bathroom
|
- bathroom
|
||||||
- living_area
|
- living_area
|
||||||
|
- perceel_area
|
||||||
- energy_label
|
- energy_label
|
||||||
- zip
|
- zip
|
||||||
- address
|
- address
|
||||||
@ -38,6 +40,7 @@ css_selector:
|
|||||||
size: ".fd-m-right-xl--bp-m .fd-text--nowrap"
|
size: ".fd-m-right-xl--bp-m .fd-text--nowrap"
|
||||||
year: ".fd-align-items-center~ .fd-align-items-center .fd-m-right-xs"
|
year: ".fd-align-items-center~ .fd-align-items-center .fd-m-right-xs"
|
||||||
living_area: ".object-kenmerken-list:nth-child(8) .fd-align-items-center:nth-child(2) span"
|
living_area: ".object-kenmerken-list:nth-child(8) .fd-align-items-center:nth-child(2) span"
|
||||||
|
perceel_area: ".object-kenmerken-list:nth-child(8) .fd-align-items-center:nth-child(5) span"
|
||||||
kind_of_house: ".object-kenmerken-list:nth-child(5) .fd-align-items-center:nth-child(2) span"
|
kind_of_house: ".object-kenmerken-list:nth-child(5) .fd-align-items-center:nth-child(2) span"
|
||||||
building_type: ".object-kenmerken-list:nth-child(5) .fd-align-items-center:nth-child(4) span"
|
building_type: ".object-kenmerken-list:nth-child(5) .fd-align-items-center:nth-child(4) span"
|
||||||
num_of_rooms: ".object-kenmerken-list:nth-child(11) .fd-align-items-center:nth-child(2)"
|
num_of_rooms: ".object-kenmerken-list:nth-child(11) .fd-align-items-center:nth-child(2)"
|
||||||
|
@ -174,8 +174,12 @@ def preprocess_data(df: pd.DataFrame, is_past: bool) -> pd.DataFrame:
|
|||||||
df["price"] = df[price_col].apply(clean_price)
|
df["price"] = df[price_col].apply(clean_price)
|
||||||
df = df[df["price"] != 0]
|
df = df[df["price"] != 0]
|
||||||
df["living_area"] = df["living_area"].apply(clean_living_area)
|
df["living_area"] = df["living_area"].apply(clean_living_area)
|
||||||
|
df["perceel_area"] = df["perceel_area"].apply(clean_living_area)
|
||||||
df = df[df["living_area"] != 0]
|
df = df[df["living_area"] != 0]
|
||||||
|
df = df[df["perceel_area"] != 0]
|
||||||
|
|
||||||
df["price_m2"] = round(df.price / df.living_area, 1)
|
df["price_m2"] = round(df.price / df.living_area, 1)
|
||||||
|
df["price_m2_total"] = round(df.price / (df.living_area + df.perceel_area), 1)
|
||||||
|
|
||||||
# Location
|
# Location
|
||||||
df["zip"] = df["zip_code"].apply(lambda x: x[:4])
|
df["zip"] = df["zip_code"].apply(lambda x: x[:4])
|
||||||
|
@ -198,6 +198,7 @@ class FundaScraper(object):
|
|||||||
self.get_value_from_css(soup, self.selectors.size),
|
self.get_value_from_css(soup, self.selectors.size),
|
||||||
self.get_value_from_css(soup, self.selectors.year),
|
self.get_value_from_css(soup, self.selectors.year),
|
||||||
self.get_value_from_css(soup, self.selectors.living_area),
|
self.get_value_from_css(soup, self.selectors.living_area),
|
||||||
|
self.get_value_from_css(soup, self.selectors.perceel_area),
|
||||||
self.get_value_from_css(soup, self.selectors.kind_of_house),
|
self.get_value_from_css(soup, self.selectors.kind_of_house),
|
||||||
self.get_value_from_css(soup, self.selectors.building_type),
|
self.get_value_from_css(soup, self.selectors.building_type),
|
||||||
self.get_value_from_css(soup, self.selectors.num_of_rooms),
|
self.get_value_from_css(soup, self.selectors.num_of_rooms),
|
||||||
|
@ -16,6 +16,7 @@ def input_data():
|
|||||||
"size": ["100 m²"],
|
"size": ["100 m²"],
|
||||||
"year": ["2000"],
|
"year": ["2000"],
|
||||||
"living_area": ["78 m²"],
|
"living_area": ["78 m²"],
|
||||||
|
"perceel_area": ["78 m²"],
|
||||||
"kind_of_house": ["Eengezinswoning"],
|
"kind_of_house": ["Eengezinswoning"],
|
||||||
"building_type": ["Bestaande bouw"],
|
"building_type": ["Bestaande bouw"],
|
||||||
"num_of_rooms": ["4 kamers (3 slaapkamers)"],
|
"num_of_rooms": ["4 kamers (3 slaapkamers)"],
|
||||||
|
Binary file not shown.
@ -24,9 +24,10 @@ out = []
|
|||||||
|
|
||||||
# URL = "https://www.funda.nl/zoeken/koop?selected_area=%5B%22utrecht,15km%22%5D&price=%22-400000%22&object_type=%5B%22house%22%5D"
|
# URL = "https://www.funda.nl/zoeken/koop?selected_area=%5B%22utrecht,15km%22%5D&price=%22-400000%22&object_type=%5B%22house%22%5D"
|
||||||
URL = "https://www.funda.nl/zoeken/koop?selected_area=%5B%22utrecht,30km%22%5D&price=%22-500000%22&object_type=%5B%22house%22%5D"
|
URL = "https://www.funda.nl/zoeken/koop?selected_area=%5B%22utrecht,30km%22%5D&price=%22-500000%22&object_type=%5B%22house%22%5D"
|
||||||
|
NUM_PAGES = 1
|
||||||
def get_funda_data():
|
def get_funda_data():
|
||||||
# scraper = FundaScraper(url="nijkerk/beschikbaar/100000-400000/woonhuis/tuin/eengezinswoning/landhuis/+30km/", find_past=False, n_pages=81)
|
# scraper = FundaScraper(url="nijkerk/beschikbaar/100000-400000/woonhuis/tuin/eengezinswoning/landhuis/+30km/", find_past=False, n_pages=81)
|
||||||
scraper = FundaScraper(url=URL, find_past=False, n_pages=150)
|
scraper = FundaScraper(url=URL, find_past=False, n_pages=NUM_PAGES)
|
||||||
df = scraper.run()
|
df = scraper.run()
|
||||||
return df
|
return df
|
||||||
|
|
||||||
@ -36,6 +37,7 @@ def get_distances(out_dict, destination_location):
|
|||||||
out_dict[key] = distance['routes'][0]
|
out_dict[key] = distance['routes'][0]
|
||||||
|
|
||||||
def generate_json(houses):
|
def generate_json(houses):
|
||||||
|
print("Gecoding distances...")
|
||||||
count = 0
|
count = 0
|
||||||
for i in tqdm.tqdm(range(len(houses))):
|
for i in tqdm.tqdm(range(len(houses))):
|
||||||
count += 1
|
count += 1
|
||||||
|
Loading…
Reference in New Issue
Block a user