"""Main funda scraper module"""
import datetime
import multiprocessing as mp
import os
from typing import Dict, List

import pandas as pd
import requests
from bs4 import BeautifulSoup
from tqdm import tqdm
from tqdm.contrib.concurrent import process_map

from funda_scraper.config.core import config
from funda_scraper.preprocess import preprocess_data
from funda_scraper.utils import logger


class FundaScraper:
    """
    Handles the main scraping workflow for collecting housing data from funda.nl.
    """

    def __init__(
        self,
        area: str = None,
        want_to: str = "buy",
        n_pages: int = 1,
        find_past: bool = False,
        url: str = "",
    ):
        self.area = area.lower().replace(" ", "-") if isinstance(area, str) else area
        self.want_to = want_to
        self.find_past = find_past
        self.n_pages = min(max(n_pages, 1), 999)
        self.links: List[str] = []
        self.raw_df = pd.DataFrame()
        self.clean_df = pd.DataFrame()
        self.base_url = config.base_url
        self.url = url
        self.selectors = config.css_selector

    def __repr__(self):
        return (
            f"FundaScraper(area={self.area}, "
            f"want_to={self.want_to}, "
            f"n_pages={self.n_pages}, "
            f"find_past={self.find_past})"
        )

    @property
    def site_url(self) -> Dict[str, str]:
        """Return the search URLs for active ("open") and past ("close") listings."""
        if self.url != "":
            # e.g. full URL: https://www.funda.nl/koop/gemeente-huizen/0-350000/tuin/+10km/
            # corresponding `url` argument: gemeente-huizen/0-350000/tuin/+10km/
            return {
                "close": f"{self.base_url}/koop/verkocht/{self.url}/",
                "open": f"{self.base_url}/koop/{self.url}/",
            }
        else:
            if self.to_buy:
                return {
                    "close": f"{self.base_url}/koop/verkocht/{self.area}/",
                    "open": f"{self.base_url}/koop/{self.area}/",
                }
            else:
                return {
                    "close": f"{self.base_url}/huur/{self.area}/verhuurd/",
                    "open": f"{self.base_url}/huur/{self.area}/",
                }
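
    # Illustrative note (an assumption, not from the original code): with
    # area="amsterdam", want_to="buy", find_past=False, and config.base_url
    # pointing at "https://www.funda.nl", fetch_links() would read the "open"
    # URL https://www.funda.nl/koop/amsterdam/, and the "close" URL
    # https://www.funda.nl/koop/verkocht/amsterdam/ when find_past=True.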

    @property
    def to_buy(self) -> bool:
        """Whether the search is for buying (True) or renting (False)."""
        if self.want_to.lower() in ["buy", "koop", "b"]:
            return True
        elif self.want_to.lower() in ["rent", "huur", "r"]:
            return False
        else:
            raise ValueError("'want_to' must be either 'buy' or 'rent'.")

    @staticmethod
    def _check_dir() -> None:
        """Ensure a temporary 'data' directory exists, creating it if necessary."""
        if not os.path.exists("data"):
            os.makedirs("data")

    @staticmethod
    def _get_links_from_one_page(url: str) -> List[str]:
        """Scrape all the available housing items from one Funda search page."""
        response = requests.get(url, headers=config.header)
        soup = BeautifulSoup(response.text, "lxml")
        house = soup.find_all(attrs={"data-object-url-tracking": "resultlist"})
        item_list = [h.get("href") for h in house]
        return list(set(item_list))

    def init(
        self,
        area: str = None,
        want_to: str = None,
        n_pages: int = None,
        find_past: bool = None,
    ) -> None:
        """Overwrite or initialise the search scope."""
        if area is not None:
            self.area = area
        if want_to is not None:
            self.want_to = want_to
        if n_pages is not None:
            self.n_pages = n_pages
        if find_past is not None:
            self.find_past = find_past

    def fetch_links(self) -> None:
        """Find all the available listing links across the search result pages."""
        if self.url == "" and (self.area is None or self.want_to is None):
            raise ValueError(
                "You must set a custom 'url', or both 'area' and 'want_to', before fetching links."
            )

        logger.info("*** Phase 1: Fetch all the available links from all pages ***")

        urls = []
        main_url = self.site_url["close"] if self.find_past else self.site_url["open"]
        for i in tqdm(range(0, self.n_pages + 1)):
            item_list = self._get_links_from_one_page(main_url + f"p{i}")
            if len(item_list) == 0:
                self.n_pages = i
                break
            urls += item_list
        urls = list(set(urls))
        logger.info(
            f"*** Got all the urls. {len(urls)} houses found in {self.n_pages} pages. ***"
        )
        self.links = ["https://www.funda.nl" + url for url in urls]

    @staticmethod
    def get_value(soup: BeautifulSoup, selector: str) -> str:
        """Use a CSS selector to extract a single feature; return "na" if not found."""
        try:
            return soup.select(selector)[0].text
        except IndexError:
            return "na"

    def scrape_from_url(self, url: str) -> List[str]:
        """Scrape all the features from one house item given a link."""

        # Initialize for each page
        response = requests.get(url, headers=config.header)
        soup = BeautifulSoup(response.text, "lxml")

        # Get the value according to respective CSS selectors
        list_since_selector = (
            self.selectors.listed_since
            if self.to_buy
            else ".fd-align-items-center:nth-child(7) span"
        )
        result = [
            url,
            self.get_value(soup, self.selectors.price),
            self.get_value(soup, self.selectors.address),
            self.get_value(soup, self.selectors.descrip),
            self.get_value(soup, list_since_selector).replace("\n", ""),
            self.get_value(soup, self.selectors.zip_code)
            .replace("\n", "")
            .replace("\r ", ""),
            self.get_value(soup, self.selectors.size),
            self.get_value(soup, self.selectors.year),
            self.get_value(soup, self.selectors.living_area),
            self.get_value(soup, self.selectors.kind_of_house),
            self.get_value(soup, self.selectors.building_type),
            self.get_value(soup, self.selectors.num_of_rooms).replace("\n", ""),
            self.get_value(soup, self.selectors.num_of_bathrooms).replace("\n", ""),
            self.get_value(soup, self.selectors.layout),
            self.get_value(soup, self.selectors.energy_label).replace("\r\n ", ""),
            self.get_value(soup, self.selectors.insulation).replace("\n", ""),
            self.get_value(soup, self.selectors.heating).replace("\n", ""),
            self.get_value(soup, self.selectors.ownership).replace("\n", ""),
            self.get_value(soup, self.selectors.exteriors),
            self.get_value(soup, self.selectors.parking),
            self.get_value(soup, self.selectors.neighborhood_name),
            self.get_value(soup, self.selectors.date_list),
            self.get_value(soup, self.selectors.date_sold),
            self.get_value(soup, self.selectors.term),
            self.get_value(soup, self.selectors.price_sold),
            self.get_value(soup, self.selectors.last_ask_price).replace("\n", ""),
            self.get_value(soup, self.selectors.last_ask_price_m2).split("\r")[0],
        ]

        return result

    def scrape_pages(self) -> pd.DataFrame:
        """Scrape all the content across multiple pages."""

        logger.info("*** Phase 2: Start scraping results from individual links ***")
        df = pd.DataFrame({key: [] for key in self.selectors.keys()})

        # Scrape pages with multiprocessing to improve efficiency
        pools = mp.cpu_count()
        content = process_map(self.scrape_from_url, self.links, max_workers=pools)

        for c in content:
            df.loc[len(df)] = c

        df["city"] = self.area
        df["log_id"] = datetime.datetime.now().strftime("%Y%m-%d%H-%M%S")
        logger.info(f"*** All scraping done: {df.shape[0]} results ***")
        self.raw_df = df
        return df

    def save_csv(self, df: pd.DataFrame, filepath: str = None) -> None:
        """Save the result to a .csv file."""
        if filepath is None:
            self._check_dir()
            date = str(datetime.datetime.now().date()).replace("-", "")
            if self.find_past:
                status = "sold" if self.to_buy else "rented"
            else:
                status = "selling" if self.to_buy else "renting"
            filepath = (
                f"./data/houseprice_{date}_{self.area}_{status}_{len(self.links)}.csv"
            )
        df.to_csv(filepath, index=False)
        logger.info(f"*** File saved: {filepath}. ***")
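
    # Illustrative note (hypothetical values, not from the original code): with
    # area="amsterdam", to_buy=True, find_past=False, 150 fetched links, and a run
    # date of 2024-01-01, the default filepath above would be
    # ./data/houseprice_20240101_amsterdam_selling_150.csv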

    def run(self, raw_data: bool = False, save: bool = False, filepath: str = None) -> pd.DataFrame:
        """Scrape all links and all content, optionally cleaning and saving the result."""
        self.fetch_links()
        self.scrape_pages()

        if raw_data:
            if save:
                self.save_csv(self.raw_df, filepath)
            return self.raw_df

        logger.info("*** Cleaning data ***")
        clean_df = preprocess_data(df=self.raw_df, is_past=self.find_past)
        self.clean_df = clean_df
        if save:
            self.save_csv(self.clean_df, filepath)
        return clean_df


if __name__ == "__main__":
    scraper = FundaScraper(area="amsterdam", want_to="rent", find_past=False, n_pages=1)
    df = scraper.run()
    print(df.head())
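
    # Illustrative example (an assumption, not part of the original script):
    # scraping a custom funda search URL instead of an area keyword, and saving
    # the raw result under ./data/. The URL path mirrors the example shown in
    # `site_url` above.
    # url_scraper = FundaScraper(url="gemeente-huizen/0-350000/tuin/+10km/", want_to="buy", n_pages=1)
    # url_df = url_scraper.run(raw_data=True, save=True)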