"""Main funda scraper module"""
import datetime
import multiprocessing as mp
import os
from typing import Dict, List, Optional

import pandas as pd
import requests
from bs4 import BeautifulSoup
from tqdm import tqdm
from tqdm.contrib.concurrent import process_map

from funda_scraper.config.core import config
from funda_scraper.preprocess import preprocess_data
from funda_scraper.utils import logger

class FundaScraper:
"""
    Scrapes housing listings (for sale or for rent) from funda.nl.
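
    Example (equivalent to the __main__ block at the bottom of this file):
        scraper = FundaScraper(area="amsterdam", want_to="rent", n_pages=1)
        df = scraper.run()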
"""
    def __init__(
        self,
        area: Optional[str] = None,
        want_to: str = "buy",
        n_pages: int = 1,
        find_past: bool = False,
        url: str = "",
    ):
self.area = area.lower().replace(" ", "-") if isinstance(area, str) else area
self.want_to = want_to
self.find_past = find_past
self.n_pages = min(max(n_pages, 1), 999)
self.links: List[str] = []
self.raw_df = pd.DataFrame()
self.clean_df = pd.DataFrame()
self.base_url = config.base_url
self.url = url
self.selectors = config.css_selector
def __repr__(self):
return (
f"FundaScraper(area={self.area}, "
f"want_to={self.want_to}, "
f"n_pages={self.n_pages}, "
f"use_past_data={self.find_past})"
)
@property
def site_url(self) -> Dict[str, str]:
"""Return the corresponding urls."""
if self.url != "":
            # `url` is the path portion of a funda.nl search URL; e.g. for
            # https://www.funda.nl/koop/gemeente-huizen/0-350000/tuin/+10km/
            # pass "gemeente-huizen/0-350000/tuin/+10km".
return {
"close": f"{self.base_url}/koop/verkocht/{self.url}/",
"open": f"{self.base_url}/koop/{self.url}/",
}
else:
if self.to_buy:
return {
"close": f"{self.base_url}/koop/verkocht/{self.area}/",
"open": f"{self.base_url}/koop/{self.area}/",
}
else:
return {
"close": f"{self.base_url}/huur/{self.area}/verhuurd/",
"open": f"{self.base_url}/huur/{self.area}/",
}
@property
def to_buy(self) -> bool:
"""Whether to buy or not"""
if self.want_to.lower() in ["buy", "koop", "b"]:
return True
elif self.want_to.lower() in ["rent", "huur", "r"]:
return False
else:
raise ValueError("'want_to' must be 'either buy' or 'rent'.")
@staticmethod
def _check_dir() -> None:
"""Check whether a temporary directory for data"""
if not os.path.exists("data"):
os.makedirs("data")
@staticmethod
def _get_links_from_one_page(url: str) -> List[str]:
"""Scrape all the available housing items from one Funda search page."""
response = requests.get(url, headers=config.header)
soup = BeautifulSoup(response.text, "lxml")
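        # Result-list items carry a data-object-url-tracking="resultlist"
        # attribute; collect their hrefs and de-duplicate.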
house = soup.find_all(attrs={"data-object-url-tracking": "resultlist"})
item_list = [h.get("href") for h in house]
return list(set(item_list))
def init(
self,
        area: Optional[str] = None,
        want_to: Optional[str] = None,
        n_pages: Optional[int] = None,
        find_past: Optional[bool] = None,
) -> None:
"""Overwrite or initialise the searching scope."""
        if area is not None:
            self.area = area.lower().replace(" ", "-")
if want_to is not None:
self.want_to = want_to
if n_pages is not None:
self.n_pages = n_pages
if find_past is not None:
self.find_past = find_past
def fetch_links(self) -> None:
"""Find all the available links across multiple pages. """
if self.url == "" and (self.area is None or self.want_to is None):
raise ValueError("You haven't set the area and what you're looking for.")
logger.info("*** Phase 1: Fetch all the available links from all pages *** ")
urls = []
main_url = self.site_url["close"] if self.find_past else self.site_url["open"]
        # Funda search pages are 1-indexed: p1, p2, ...
        for i in tqdm(range(1, self.n_pages + 1)):
            item_list = self._get_links_from_one_page(main_url + f"p{i}")
            if len(item_list) == 0:
                self.n_pages = i - 1
                break
            urls += item_list
urls = list(set(urls))
logger.info(
f"*** Got all the urls. {len(urls)} houses found in {self.n_pages} pages. ***"
)
self.links = ["https://www.funda.nl" + url for url in urls]
@staticmethod
def get_value(soup: BeautifulSoup, selector: str) -> str:
"""Use CSS selector to find certain features."""
try:
return soup.select(selector)[0].text
except IndexError:
return "na"
def scrape_from_url(self, url: str) -> List[str]:
"""Scrape all the features from one house item given a link. """
# Initialize for each page
response = requests.get(url, headers=config.header)
soup = BeautifulSoup(response.text, "lxml")
# Get the value according to respective CSS selectors
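        # Rental listings expose "listed since" under a different element than
        # listings for sale, hence the hard-coded selector for the rent case.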
list_since_selector = (
self.selectors.listed_since
if self.to_buy
else ".fd-align-items-center:nth-child(7) span"
)
result = [
url,
self.get_value(soup, self.selectors.price),
self.get_value(soup, self.selectors.address),
self.get_value(soup, self.selectors.descrip),
self.get_value(soup, list_since_selector).replace("\n", ""),
self.get_value(soup, self.selectors.zip_code)
.replace("\n", "")
.replace("\r ", ""),
self.get_value(soup, self.selectors.size),
self.get_value(soup, self.selectors.year),
self.get_value(soup, self.selectors.living_area),
self.get_value(soup, self.selectors.kind_of_house),
self.get_value(soup, self.selectors.building_type),
self.get_value(soup, self.selectors.num_of_rooms).replace("\n", ""),
self.get_value(soup, self.selectors.num_of_bathrooms).replace("\n", ""),
self.get_value(soup, self.selectors.layout),
self.get_value(soup, self.selectors.energy_label).replace(
"\r\n ", ""
),
self.get_value(soup, self.selectors.insulation).replace("\n", ""),
self.get_value(soup, self.selectors.heating).replace("\n", ""),
self.get_value(soup, self.selectors.ownership).replace("\n", ""),
self.get_value(soup, self.selectors.exteriors),
self.get_value(soup, self.selectors.parking),
self.get_value(soup, self.selectors.neighborhood_name),
self.get_value(soup, self.selectors.date_list),
self.get_value(soup, self.selectors.date_sold),
self.get_value(soup, self.selectors.term),
self.get_value(soup, self.selectors.price_sold),
self.get_value(soup, self.selectors.last_ask_price).replace("\n", ""),
self.get_value(soup, self.selectors.last_ask_price_m2).split("\r")[0],
]
return result
    def scrape_pages(self) -> pd.DataFrame:
        """Scrape the content of all collected links across multiple pages."""
        logger.info("*** Phase 2: Start scraping results from individual links ***")
        df = pd.DataFrame({key: [] for key in self.selectors.keys()})
        # Scrape pages with multiprocessing to improve efficiency
        pools = mp.cpu_count()
        content = process_map(self.scrape_from_url, self.links, max_workers=pools)
        for c in content:
            df.loc[len(df)] = c
        df["city"] = self.area
        df["log_id"] = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
        logger.info(f"*** All scraping done: {df.shape[0]} results ***")
        self.raw_df = df
        return df
    def save_csv(self, df: pd.DataFrame, filepath: Optional[str] = None) -> None:
"""Save the result to a .csv file."""
if filepath is None:
self._check_dir()
date = str(datetime.datetime.now().date()).replace("-", "")
if self.find_past:
if self.to_buy:
status = "sold"
else:
status = "rented"
else:
if self.to_buy:
status = "selling"
else:
status = "renting"
filepath = (
f"./data/houseprice_{date}_{self.area}_{status}_{len(self.links)}.csv"
)
df.to_csv(filepath, index=False)
logger.info(f"*** File saved: {filepath}. ***")
    def run(
        self, raw_data: bool = False, save: bool = False, filepath: Optional[str] = None
    ) -> pd.DataFrame:
        """Fetch all links, scrape every listing, and optionally clean and save the data."""
        self.fetch_links()
        self.scrape_pages()
        if raw_data:
            df = self.raw_df
        else:
            logger.info("*** Cleaning data ***")
            df = preprocess_data(df=self.raw_df, is_past=self.find_past)
            self.clean_df = df
        if save:
            self.save_csv(df, filepath)
        return df
if __name__ == "__main__":
scraper = FundaScraper(area="amsterdam", want_to="rent", find_past=False, n_pages=1)
df = scraper.run()
print(df.head())
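    # A sketch of an alternative run, assuming a funda.nl search path is passed
    # via `url` instead of `area`/`want_to` (the path below is illustrative):
    # scraper = FundaScraper(url="gemeente-huizen/0-350000/tuin/+10km", n_pages=2)
    # df = scraper.run(raw_data=True, save=True)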