diff --git a/scrape/funda-scraper/funda_scraper/__pycache__/preprocess.cpython-310.pyc b/scrape/funda-scraper/funda_scraper/__pycache__/preprocess.cpython-310.pyc index 959af01b..1d8222fe 100644 Binary files a/scrape/funda-scraper/funda_scraper/__pycache__/preprocess.cpython-310.pyc and b/scrape/funda-scraper/funda_scraper/__pycache__/preprocess.cpython-310.pyc differ diff --git a/scrape/funda-scraper/funda_scraper/__pycache__/scrape.cpython-310.pyc b/scrape/funda-scraper/funda_scraper/__pycache__/scrape.cpython-310.pyc index 66b72123..308b0686 100644 Binary files a/scrape/funda-scraper/funda_scraper/__pycache__/scrape.cpython-310.pyc and b/scrape/funda-scraper/funda_scraper/__pycache__/scrape.cpython-310.pyc differ diff --git a/scrape/funda-scraper/funda_scraper/config/config.yaml b/scrape/funda-scraper/funda_scraper/config/config.yaml index 4c231af4..df251991 100644 --- a/scrape/funda-scraper/funda_scraper/config/config.yaml +++ b/scrape/funda-scraper/funda_scraper/config/config.yaml @@ -14,10 +14,12 @@ keep_cols: - building_type - price - price_m2 + - price_m2_total - room - bedroom - bathroom - living_area + - perceel_area - energy_label - zip - address @@ -38,6 +40,7 @@ css_selector: size: ".fd-m-right-xl--bp-m .fd-text--nowrap" year: ".fd-align-items-center~ .fd-align-items-center .fd-m-right-xs" living_area: ".object-kenmerken-list:nth-child(8) .fd-align-items-center:nth-child(2) span" + perceel_area: ".object-kenmerken-list:nth-child(8) .fd-align-items-center:nth-child(5) span" kind_of_house: ".object-kenmerken-list:nth-child(5) .fd-align-items-center:nth-child(2) span" building_type: ".object-kenmerken-list:nth-child(5) .fd-align-items-center:nth-child(4) span" num_of_rooms: ".object-kenmerken-list:nth-child(11) .fd-align-items-center:nth-child(2)" diff --git a/scrape/funda-scraper/funda_scraper/preprocess.py b/scrape/funda-scraper/funda_scraper/preprocess.py index e9c017fa..7c783bbc 100644 --- a/scrape/funda-scraper/funda_scraper/preprocess.py +++ b/scrape/funda-scraper/funda_scraper/preprocess.py @@ -174,8 +174,12 @@ def preprocess_data(df: pd.DataFrame, is_past: bool) -> pd.DataFrame: df["price"] = df[price_col].apply(clean_price) df = df[df["price"] != 0] df["living_area"] = df["living_area"].apply(clean_living_area) + df["perceel_area"] = df["perceel_area"].apply(clean_living_area) df = df[df["living_area"] != 0] + df = df[df["perceel_area"] != 0] + df["price_m2"] = round(df.price / df.living_area, 1) + df["price_m2_total"] = round(df.price / (df.living_area + df.perceel_area), 1) # Location df["zip"] = df["zip_code"].apply(lambda x: x[:4]) diff --git a/scrape/funda-scraper/funda_scraper/scrape.py b/scrape/funda-scraper/funda_scraper/scrape.py index 1ea7b55e..30b805c6 100644 --- a/scrape/funda-scraper/funda_scraper/scrape.py +++ b/scrape/funda-scraper/funda_scraper/scrape.py @@ -198,6 +198,7 @@ class FundaScraper(object): self.get_value_from_css(soup, self.selectors.size), self.get_value_from_css(soup, self.selectors.year), self.get_value_from_css(soup, self.selectors.living_area), + self.get_value_from_css(soup, self.selectors.perceel_area), self.get_value_from_css(soup, self.selectors.kind_of_house), self.get_value_from_css(soup, self.selectors.building_type), self.get_value_from_css(soup, self.selectors.num_of_rooms), diff --git a/scrape/funda-scraper/tests/test_preprocess.py b/scrape/funda-scraper/tests/test_preprocess.py index fc33a104..9b28dc55 100644 --- a/scrape/funda-scraper/tests/test_preprocess.py +++ b/scrape/funda-scraper/tests/test_preprocess.py @@ -16,6 +16,7 @@ def input_data(): "size": ["100 m²"], "year": ["2000"], "living_area": ["78 m²"], + "perceel_area": ["78 m²"], "kind_of_house": ["Eengezinswoning"], "building_type": ["Bestaande bouw"], "num_of_rooms": ["4 kamers (3 slaapkamers)"], diff --git a/scrape/panda_dump.bin b/scrape/panda_dump.bin index 4ce53a13..4eb8d96b 100644 Binary files a/scrape/panda_dump.bin and b/scrape/panda_dump.bin differ diff --git a/scrape/usse.py b/scrape/usse.py index a8838da8..c7bff049 100644 --- a/scrape/usse.py +++ b/scrape/usse.py @@ -24,9 +24,10 @@ out = [] # URL = "https://www.funda.nl/zoeken/koop?selected_area=%5B%22utrecht,15km%22%5D&price=%22-400000%22&object_type=%5B%22house%22%5D" URL = "https://www.funda.nl/zoeken/koop?selected_area=%5B%22utrecht,30km%22%5D&price=%22-500000%22&object_type=%5B%22house%22%5D" +NUM_PAGES = 1 def get_funda_data(): # scraper = FundaScraper(url="nijkerk/beschikbaar/100000-400000/woonhuis/tuin/eengezinswoning/landhuis/+30km/", find_past=False, n_pages=81) - scraper = FundaScraper(url=URL, find_past=False, n_pages=150) + scraper = FundaScraper(url=URL, find_past=False, n_pages=NUM_PAGES) df = scraper.run() return df @@ -36,6 +37,7 @@ def get_distances(out_dict, destination_location): out_dict[key] = distance['routes'][0] def generate_json(houses): + print("Gecoding distances...") count = 0 for i in tqdm.tqdm(range(len(houses))): count += 1