"""Main funda scraper module"""
import datetime
import multiprocessing as mp
import os
from typing import Dict, List, Optional

import pandas as pd
import requests
from bs4 import BeautifulSoup
from tqdm import tqdm
from tqdm.contrib.concurrent import process_map

from funda_scraper.config.core import config
from funda_scraper.preprocess import preprocess_data  # noqa: F401  (used by the disabled cleaning step in `run`)
from funda_scraper.utils import logger


class FundaScraper:
    """
    Handles the main scraping function.

    Phase 1 (`fetch_links`) walks the Funda search-result pages and collects
    listing URLs; phase 2 (`scrape_pages`) fetches every listing in parallel
    and extracts its features via the CSS selectors from the project config.
    """

    def __init__(
        self,
        area: Optional[str] = None,
        want_to: str = "buy",
        n_pages: int = 1,
        find_past: bool = False,
        url: str = "",
    ):
        """
        :param area: city or region to search in, e.g. "amsterdam"
        :param want_to: "buy"/"koop"/"b" for sales, "rent"/"huur"/"r" for rentals
        :param n_pages: number of search-result pages to walk (clamped to 1..999)
        :param find_past: scrape sold/rented listings instead of active ones
        :param url: explicit Funda search path (e.g. "gemeente-huizen/0-350000/tuin/+10km/");
            when non-empty it overrides the area/want_to based URLs
        """
        # Normalise the area the way Funda expects it in URLs ("den haag" -> "den-haag").
        self.area = area.lower().replace(" ", "-") if isinstance(area, str) else area
        self.want_to = want_to
        self.find_past = find_past
        self.n_pages = min(max(n_pages, 1), 999)  # keep the page count sane
        self.links: List[str] = []
        self.raw_df = pd.DataFrame()
        self.clean_df = pd.DataFrame()
        self.base_url = config.base_url
        self.url = url
        self.selectors = config.css_selector

    def __repr__(self) -> str:
        return (
            f"FundaScraper(area={self.area}, "
            f"want_to={self.want_to}, "
            f"n_pages={self.n_pages}, "
            f"use_past_data={self.find_past})"
        )

    @property
    def site_url(self) -> Dict[str, str]:
        """Return the search URLs: "close" for sold/rented listings, "open" for active ones."""
        if self.url != "":
            # Caller supplied an explicit search path, e.g.
            # https://www.funda.nl/koop/gemeente-huizen/0-350000/tuin/+10km/
            return {
                "close": f"{self.base_url}/koop/verkocht/{self.url}/",
                "open": f"{self.base_url}/koop/{self.url}/",
            }
        if self.to_buy:
            return {
                "close": f"{self.base_url}/koop/verkocht/{self.area}/",
                "open": f"{self.base_url}/koop/{self.area}/",
            }
        return {
            "close": f"{self.base_url}/huur/{self.area}/verhuurd/",
            "open": f"{self.base_url}/huur/{self.area}/",
        }

    @property
    def to_buy(self) -> bool:
        """True when searching purchase listings, False for rentals.

        :raises ValueError: if ``want_to`` is not one of the recognised aliases.
        """
        want = self.want_to.lower()
        if want in ["buy", "koop", "b"]:
            return True
        if want in ["rent", "huur", "r"]:
            return False
        # Fixed message wording (original read "must be 'either buy' or 'rent'").
        raise ValueError("'want_to' must be either 'buy' or 'rent'.")

    @staticmethod
    def _check_dir() -> None:
        """Ensure the temporary ./data directory for CSV output exists."""
        os.makedirs("data", exist_ok=True)

    @staticmethod
    def _get_links_from_one_page(url: str) -> List[str]:
        """Scrape all the available housing item links from one Funda search page."""
        response = requests.get(url, headers=config.header)
        soup = BeautifulSoup(response.text, "lxml")
        houses = soup.find_all(attrs={"data-object-url-tracking": "resultlist"})
        # Dedupe within the page; hrefs are site-relative paths.
        return list({h.get("href") for h in houses})

    def init(
        self,
        area: Optional[str] = None,
        want_to: Optional[str] = None,
        n_pages: Optional[int] = None,
        find_past: Optional[bool] = None,
    ) -> None:
        """Overwrite or initialise the searching scope (only non-None arguments are applied)."""
        if area is not None:
            self.area = area
        if want_to is not None:
            self.want_to = want_to
        if n_pages is not None:
            self.n_pages = n_pages
        if find_past is not None:
            self.find_past = find_past

    def fetch_links(self) -> None:
        """Find all the available listing links across the requested pages.

        Populates ``self.links`` with absolute listing URLs. Stops early (and
        records the actual page count) when a page comes back empty.

        :raises ValueError: if neither ``url`` nor the area/want_to pair is set.
        """
        if self.url == "" and (self.area is None or self.want_to is None):
            raise ValueError("You haven't set the area and what you're looking for.")

        logger.info("*** Phase 1: Fetch all the available links from all pages *** ")
        urls: List[str] = []
        main_url = self.site_url["close"] if self.find_past else self.site_url["open"]
        # Funda paginates from p1; the original loop started at p0 and therefore
        # requested n_pages + 1 pages (the duplicate was hidden by the set() below).
        for i in tqdm(range(1, self.n_pages + 1)):
            item_list = self._get_links_from_one_page(main_url + f"p{i}")
            if len(item_list) == 0:
                # Ran out of results early; record how many pages actually had data.
                self.n_pages = i - 1
                break
            urls += item_list
        urls = list(set(urls))
        logger.info(
            f"*** Got all the urls. {len(urls)} houses found in {self.n_pages} pages. ***"
        )
        # hrefs are relative; prefix with the configured base URL.
        self.links = [self.base_url + url for url in urls]

    @staticmethod
    def get_value(soup: BeautifulSoup, selector: str) -> str:
        """Use a CSS selector to extract one feature; returns "na" when absent."""
        try:
            return soup.select(selector)[0].text
        except IndexError:
            return "na"

    def scrape_from_url(self, url: str) -> List[str]:
        """Scrape all the features of one house item given its detail-page link.

        :param url: absolute URL of a single listing
        :return: raw feature strings, in the same order as the config selectors
        """
        # Initialize for each page
        response = requests.get(url, headers=config.header)
        soup = BeautifulSoup(response.text, "lxml")

        # Rentals expose "listed since" under a different selector than sales.
        list_since_selector = (
            self.selectors.listed_since
            if self.to_buy
            else ".fd-align-items-center:nth-child(7) span"
        )
        result = [
            url,
            self.get_value(soup, self.selectors.price),
            self.get_value(soup, self.selectors.address),
            self.get_value(soup, self.selectors.descrip),
            self.get_value(soup, list_since_selector).replace("\n", ""),
            self.get_value(soup, self.selectors.zip_code)
            .replace("\n", "")
            .replace("\r ", ""),
            self.get_value(soup, self.selectors.size),
            self.get_value(soup, self.selectors.year),
            self.get_value(soup, self.selectors.living_area),
            self.get_value(soup, self.selectors.kind_of_house),
            self.get_value(soup, self.selectors.building_type),
            self.get_value(soup, self.selectors.num_of_rooms).replace("\n", ""),
            self.get_value(soup, self.selectors.num_of_bathrooms).replace("\n", ""),
            self.get_value(soup, self.selectors.layout),
            self.get_value(soup, self.selectors.energy_label).replace(
                "\r\n        ", ""
            ),
            self.get_value(soup, self.selectors.insulation).replace("\n", ""),
            self.get_value(soup, self.selectors.heating).replace("\n", ""),
            self.get_value(soup, self.selectors.ownership).replace("\n", ""),
            self.get_value(soup, self.selectors.exteriors),
            self.get_value(soup, self.selectors.parking),
            self.get_value(soup, self.selectors.neighborhood_name),
            self.get_value(soup, self.selectors.date_list),
            self.get_value(soup, self.selectors.date_sold),
            self.get_value(soup, self.selectors.term),
            self.get_value(soup, self.selectors.price_sold),
            self.get_value(soup, self.selectors.last_ask_price).replace("\n", ""),
            self.get_value(soup, self.selectors.last_ask_price_m2).split("\r")[0],
        ]
        return result

    def scrape_pages(self) -> pd.DataFrame:
        """Scrape the content of every fetched link, in parallel.

        Stores the result in ``self.raw_df`` and returns it.
        """
        logger.info("*** Phase 2: Start scraping results from individual links ***")
        # Scrape pages with multiprocessing to improve efficiency.
        pools = mp.cpu_count()
        content = process_map(self.scrape_from_url, self.links, max_workers=pools)

        # Build the frame in one shot instead of quadratic row-by-row .loc appends.
        df = pd.DataFrame(content, columns=list(self.selectors.keys()))
        df["city"] = self.area
        # NOTE(review): this strftime pattern produces an unusual grouping
        # (e.g. "202406-1514-3022"); possibly a typo for "%Y%m%d-%H%M%S".
        # Kept byte-identical because the value lands in the output data.
        df["log_id"] = datetime.datetime.now().strftime("%Y%m-%d%H-%M%S")
        logger.info(f"*** All scraping done: {df.shape[0]} results ***")
        self.raw_df = df
        return df

    def save_csv(self, df: pd.DataFrame, filepath: Optional[str] = None) -> None:
        """Save the result to a .csv file.

        When no ``filepath`` is given, writes to
        ``./data/houseprice_<date>_<area>_<status>_<count>.csv``.
        """
        if filepath is None:
            self._check_dir()
            date = str(datetime.datetime.now().date()).replace("-", "")
            if self.find_past:
                status = "sold" if self.to_buy else "rented"
            else:
                status = "selling" if self.to_buy else "renting"
            filepath = (
                f"./data/houseprice_{date}_{self.area}_{status}_{len(self.links)}.csv"
            )
        df.to_csv(filepath, index=False)
        logger.info(f"*** File saved: {filepath}. ***")

    def run(
        self, raw_data: bool = False, save: bool = False, filepath: Optional[str] = None
    ) -> pd.DataFrame:
        """Scrape all links and all content, returning the raw DataFrame.

        Fix: the original fell off the end (returned None) when
        ``raw_data=False`` because the cleaning branch was commented out, which
        crashed the ``__main__`` demo on ``df.head()``. The raw frame is now
        always returned and ``save`` is always honoured.

        :param raw_data: kept for backward compatibility; the cleaning step is
            currently disabled, so the raw frame is returned either way
        :param save: write the result to CSV via :meth:`save_csv`
        :param filepath: optional explicit CSV path
        """
        self.fetch_links()
        self.scrape_pages()
        if save:
            self.save_csv(self.raw_df, filepath)
        # Cleaning step intentionally disabled upstream; re-enable via:
        #   self.clean_df = preprocess_data(df=self.raw_df, is_past=self.find_past)
        return self.raw_df


if __name__ == "__main__":
    scraper = FundaScraper(
        area="amsterdam", want_to="rent", find_past=False, n_pages=1
    )
    df = scraper.run()
    print(df.head())