358 lines
13 KiB
Python
358 lines
13 KiB
Python
"""Main funda scraper module"""
|
|
import argparse
|
|
import datetime
|
|
import json
|
|
import multiprocessing as mp
|
|
import os
|
|
from typing import List, Literal, Optional
|
|
|
|
import pandas as pd
|
|
import requests
|
|
from bs4 import BeautifulSoup
|
|
from tqdm import tqdm
|
|
from tqdm.contrib.concurrent import process_map
|
|
|
|
from funda_scraper.config.core import config
|
|
from funda_scraper.preprocess import clean_list_date, preprocess_data
|
|
from funda_scraper.utils import logger
|
|
|
|
|
|
class FundaScraper(object):
|
|
"""
|
|
Handles the main scraping function.
|
|
"""
|
|
|
|
def __init__(
|
|
self,
|
|
want_to: str = "koop",
|
|
area: str = "",
|
|
page_start: int = 1,
|
|
n_pages: int = 1,
|
|
find_past: bool = False,
|
|
min_price: Optional[int] = None,
|
|
max_price: Optional[int] = None,
|
|
url : str = "",
|
|
):
|
|
# Init attributes
|
|
self.area = area.lower().replace(" ", "-")
|
|
self.want_to = want_to
|
|
self.find_past = find_past
|
|
self.page_start = max(page_start, 1)
|
|
self.n_pages = max(n_pages, 1)
|
|
self.page_end = self.page_start + self.n_pages - 1
|
|
self.min_price = min_price
|
|
self.max_price = max_price
|
|
|
|
# Instantiate along the way
|
|
self.links: List[str] = []
|
|
self.raw_df = pd.DataFrame()
|
|
self.clean_df = pd.DataFrame()
|
|
self.base_url = config.base_url
|
|
self.url = url
|
|
self.selectors = config.css_selector
|
|
|
|
def __repr__(self):
|
|
return (
|
|
f"FundaScraper(area={self.area}, "
|
|
f"want_to={self.want_to}, "
|
|
f"n_pages={self.n_pages}, "
|
|
f"page_start={self.page_start}, "
|
|
f"find_past={self.find_past})"
|
|
f"min_price={self.min_price})"
|
|
f"max_price={self.max_price})"
|
|
)
|
|
|
|
@property
|
|
def to_buy(self) -> bool:
|
|
"""Whether to buy or not"""
|
|
if self.want_to.lower() in ["buy", "koop", "b", "k"]:
|
|
return True
|
|
elif self.want_to.lower() in ["rent", "huur", "r", "h"]:
|
|
return False
|
|
else:
|
|
raise ValueError("'want_to' must be either 'buy' or 'rent'.")
|
|
|
|
@staticmethod
|
|
def _check_dir() -> None:
|
|
"""Check whether a temporary directory for data"""
|
|
if not os.path.exists("data"):
|
|
os.makedirs("data")
|
|
|
|
@staticmethod
|
|
def _get_links_from_one_parent(url: str) -> List[str]:
|
|
"""Scrape all the available housing items from one Funda search page."""
|
|
response = requests.get(url, headers=config.header)
|
|
soup = BeautifulSoup(response.text, "lxml")
|
|
|
|
script_tag = soup.find_all("script", {"type": "application/ld+json"})[0]
|
|
json_data = json.loads(script_tag.contents[0])
|
|
urls = [item["url"] for item in json_data["itemListElement"]]
|
|
return list(set(urls))
|
|
|
|
def reset(
|
|
self,
|
|
area: Optional[str] = None,
|
|
want_to: Optional[str] = None,
|
|
page_start: Optional[int] = None,
|
|
n_pages: Optional[int] = None,
|
|
find_past: Optional[bool] = None,
|
|
min_price: Optional[int] = None,
|
|
max_price: Optional[int] = None,
|
|
) -> None:
|
|
"""Overwrite or initialise the searching scope."""
|
|
if area is not None:
|
|
self.area = area
|
|
if want_to is not None:
|
|
self.want_to = want_to
|
|
if page_start is not None:
|
|
self.page_start = max(page_start, 1)
|
|
if n_pages is not None:
|
|
self.n_pages = max(n_pages, 1)
|
|
if find_past is not None:
|
|
self.find_past = find_past
|
|
if min_price is not None:
|
|
self.min_price = min_price
|
|
if max_price is not None:
|
|
self.max_price = max_price
|
|
|
|
def fetch_all_links(self, page_start: int = None, n_pages: int = None) -> None:
|
|
"""Find all the available links across multiple pages."""
|
|
|
|
page_start = self.page_start if page_start is None else page_start
|
|
n_pages = self.n_pages if n_pages is None else n_pages
|
|
|
|
logger.info("*** Phase 1: Fetch all the available links from all pages *** ")
|
|
urls = []
|
|
main_url = self._build_main_query_url()
|
|
|
|
for i in tqdm(range(page_start, page_start + n_pages)):
|
|
try:
|
|
item_list = self._get_links_from_one_parent(
|
|
f"{main_url}&search_result={i}"
|
|
)
|
|
urls += item_list
|
|
except IndexError:
|
|
self.page_end = i
|
|
logger.info(f"*** The last available page is {self.page_end} ***")
|
|
break
|
|
|
|
urls = list(set(urls))
|
|
logger.info(
|
|
f"*** Got all the urls. {len(urls)} houses found from {self.page_start} to {self.page_end} ***"
|
|
)
|
|
self.links = urls
|
|
|
|
def _build_main_query_url(self) -> str:
|
|
if self.url: # Override URL if provided.
|
|
main_url = self.url
|
|
else:
|
|
query = "koop" if self.to_buy else "huur"
|
|
main_url = f"{self.base_url}/zoeken/{query}?selected_area=%22{self.area}%22"
|
|
|
|
if self.find_past:
|
|
main_url = f"{main_url}&availability=%22unavailable%22"
|
|
|
|
if self.min_price is not None or self.max_price is not None:
|
|
min_price = "" if self.min_price is None else self.min_price
|
|
max_price = "" if self.max_price is None else self.max_price
|
|
main_url = f"{main_url}&price=%22{min_price}-{max_price}%22"
|
|
|
|
return main_url
|
|
|
|
@staticmethod
|
|
def get_value_from_css(soup: BeautifulSoup, selector: str) -> str:
|
|
"""Use CSS selector to find certain features."""
|
|
result = soup.select(selector)
|
|
if len(result) > 0:
|
|
result = result[0].text
|
|
else:
|
|
result = "na"
|
|
return result
|
|
|
|
def scrape_one_link(self, link: str) -> List[str]:
|
|
"""Scrape all the features from one house item given a link."""
|
|
|
|
# Initialize for each page
|
|
response = requests.get(link, headers=config.header)
|
|
soup = BeautifulSoup(response.text, "lxml")
|
|
|
|
# Get the value according to respective CSS selectors
|
|
if self.to_buy:
|
|
if self.find_past:
|
|
list_since_selector = self.selectors.date_list
|
|
else:
|
|
list_since_selector = self.selectors.listed_since
|
|
else:
|
|
if self.find_past:
|
|
list_since_selector = ".fd-align-items-center:nth-child(9) span"
|
|
else:
|
|
list_since_selector = ".fd-align-items-center:nth-child(7) span"
|
|
|
|
result = [
|
|
link,
|
|
self.get_value_from_css(soup, self.selectors.price),
|
|
self.get_value_from_css(soup, self.selectors.address),
|
|
self.get_value_from_css(soup, self.selectors.descrip),
|
|
self.get_value_from_css(soup, list_since_selector),
|
|
self.get_value_from_css(soup, self.selectors.zip_code),
|
|
self.get_value_from_css(soup, self.selectors.size),
|
|
self.get_value_from_css(soup, self.selectors.year),
|
|
self.get_value_from_css(soup, self.selectors.living_area),
|
|
self.get_value_from_css(soup, self.selectors.kind_of_house),
|
|
self.get_value_from_css(soup, self.selectors.building_type),
|
|
self.get_value_from_css(soup, self.selectors.num_of_rooms),
|
|
self.get_value_from_css(soup, self.selectors.num_of_bathrooms),
|
|
self.get_value_from_css(soup, self.selectors.layout),
|
|
self.get_value_from_css(soup, self.selectors.energy_label),
|
|
self.get_value_from_css(soup, self.selectors.insulation),
|
|
self.get_value_from_css(soup, self.selectors.heating),
|
|
self.get_value_from_css(soup, self.selectors.ownership),
|
|
self.get_value_from_css(soup, self.selectors.exteriors),
|
|
self.get_value_from_css(soup, self.selectors.parking),
|
|
self.get_value_from_css(soup, self.selectors.neighborhood_name),
|
|
self.get_value_from_css(soup, self.selectors.date_list),
|
|
self.get_value_from_css(soup, self.selectors.date_sold),
|
|
self.get_value_from_css(soup, self.selectors.term),
|
|
self.get_value_from_css(soup, self.selectors.price_sold),
|
|
self.get_value_from_css(soup, self.selectors.last_ask_price),
|
|
self.get_value_from_css(soup, self.selectors.last_ask_price_m2).split("\r")[
|
|
0
|
|
],
|
|
]
|
|
|
|
# Deal with list_since_selector especially, since its CSS varies sometimes
|
|
# if clean_list_date(result[4]) == "na":
|
|
# for i in range(6, 16):
|
|
# selector = f".fd-align-items-center:nth-child({i}) span"
|
|
# update_list_since = self.get_value_from_css(soup, selector)
|
|
# if clean_list_date(update_list_since) == "na":
|
|
# pass
|
|
# else:
|
|
# result[4] = update_list_since
|
|
|
|
photos_list = [p.get("data-lazy-srcset") for p in soup.select(self.selectors.photo)]
|
|
photos_string = ", ".join(photos_list)
|
|
|
|
# Clean up the retried result from one page
|
|
result = [r.replace("\n", "").replace("\r", "").strip() for r in result]
|
|
result.append(photos_string)
|
|
return result
|
|
|
|
def scrape_pages(self) -> None:
|
|
"""Scrape all the content acoss multiple pages."""
|
|
|
|
logger.info("*** Phase 2: Start scraping from individual links ***")
|
|
df = pd.DataFrame({key: [] for key in self.selectors.keys()})
|
|
|
|
# Scrape pages with multiprocessing to improve efficiency
|
|
# TODO: use asynctio instead
|
|
pools = mp.cpu_count()
|
|
content = process_map(self.scrape_one_link, self.links, max_workers=pools)
|
|
|
|
for i, c in enumerate(content):
|
|
df.loc[len(df)] = c
|
|
|
|
df["city"] = df["url"].map(lambda x: x.split("/")[4])
|
|
df["log_id"] = datetime.datetime.now().strftime("%Y%m-%d%H-%M%S")
|
|
if not self.find_past:
|
|
df = df.drop(["term", "price_sold", "date_sold"], axis=1)
|
|
logger.info(f"*** All scraping done: {df.shape[0]} results ***")
|
|
self.raw_df = df
|
|
|
|
def save_csv(self, df: pd.DataFrame, filepath: str = None) -> None:
|
|
"""Save the result to a .csv file."""
|
|
if filepath is None:
|
|
self._check_dir()
|
|
date = str(datetime.datetime.now().date()).replace("-", "")
|
|
status = "unavailable" if self.find_past else "unavailable"
|
|
want_to = "buy" if self.to_buy else "rent"
|
|
filepath = f"./data/houseprice_{date}_{self.area}_{want_to}_{status}_{len(self.links)}.csv"
|
|
df.to_csv(filepath, index=False)
|
|
logger.info(f"*** File saved: {filepath}. ***")
|
|
|
|
def run(
|
|
self, raw_data: bool = False, save: bool = False, filepath: str = None
|
|
) -> pd.DataFrame:
|
|
"""
|
|
Scrape all links and all content.
|
|
|
|
:param raw_data: if true, the data won't be pre-processed
|
|
:param save: if true, the data will be saved as a csv file
|
|
:param filepath: the name for the file
|
|
:return: the (pre-processed) dataframe from scraping
|
|
"""
|
|
self.fetch_all_links()
|
|
self.scrape_pages()
|
|
|
|
if raw_data:
|
|
df = self.raw_df
|
|
else:
|
|
logger.info("*** Cleaning data ***")
|
|
df = preprocess_data(df=self.raw_df, is_past=self.find_past)
|
|
self.clean_df = df
|
|
|
|
if save:
|
|
self.save_csv(df, filepath)
|
|
|
|
logger.info("*** Done! ***")
|
|
return df
|
|
|
|
|
|
if __name__ == "__main__":
|
|
parser = argparse.ArgumentParser()
|
|
parser.add_argument(
|
|
"--area",
|
|
type=str,
|
|
help="Specify which area you are looking for",
|
|
default="amsterdam",
|
|
)
|
|
parser.add_argument(
|
|
"--want_to",
|
|
type=str,
|
|
help="Specify you want to 'rent' or 'buy'",
|
|
default="rent",
|
|
)
|
|
parser.add_argument(
|
|
"--find_past",
|
|
type=bool,
|
|
help="Indicate whether you want to use hisotrical data or not",
|
|
default=False,
|
|
)
|
|
parser.add_argument(
|
|
"--page_start", type=int, help="Specify which page to start scraping", default=1
|
|
)
|
|
parser.add_argument(
|
|
"--n_pages", type=int, help="Specify how many pages to scrape", default=1
|
|
)
|
|
parser.add_argument(
|
|
"--min_price", type=int, help="Specify the min price", default=None
|
|
)
|
|
parser.add_argument(
|
|
"--max_price", type=int, help="Specify the max price", default=None
|
|
)
|
|
parser.add_argument(
|
|
"--raw_data",
|
|
type=bool,
|
|
help="Indicate whether you want the raw scraping result or preprocessed one",
|
|
default=False,
|
|
)
|
|
parser.add_argument(
|
|
"--save",
|
|
type=bool,
|
|
help="Indicate whether you want to save the data or not",
|
|
default=True,
|
|
)
|
|
|
|
args = parser.parse_args()
|
|
scraper = FundaScraper(
|
|
area=args.area,
|
|
want_to=args.want_to,
|
|
find_past=args.find_past,
|
|
page_start=args.page_start,
|
|
n_pages=args.n_pages,
|
|
min_price=args.min_price,
|
|
max_price=args.max_price,
|
|
)
|
|
df = scraper.run(raw_data=args.raw_data, save=args.save)
|
|
print(df.head())
|