diff --git a/scrape/.vscode/launch.json b/scrape/.vscode/launch.json index 77a99a10..0c8991eb 100644 --- a/scrape/.vscode/launch.json +++ b/scrape/.vscode/launch.json @@ -10,7 +10,7 @@ "request": "launch", "program": "usse.py", "console": "integratedTerminal", - "justMyCode": true + "justMyCode": false } ] } \ No newline at end of file diff --git a/scrape/funda-scraper/funda_scraper/__pycache__/__init__.cpython-310.pyc b/scrape/funda-scraper/funda_scraper/__pycache__/__init__.cpython-310.pyc index a5f980b5..0266bc43 100644 Binary files a/scrape/funda-scraper/funda_scraper/__pycache__/__init__.cpython-310.pyc and b/scrape/funda-scraper/funda_scraper/__pycache__/__init__.cpython-310.pyc differ diff --git a/scrape/funda-scraper/funda_scraper/__pycache__/preprocess.cpython-310.pyc b/scrape/funda-scraper/funda_scraper/__pycache__/preprocess.cpython-310.pyc index 1d8222fe..4630529c 100644 Binary files a/scrape/funda-scraper/funda_scraper/__pycache__/preprocess.cpython-310.pyc and b/scrape/funda-scraper/funda_scraper/__pycache__/preprocess.cpython-310.pyc differ diff --git a/scrape/funda-scraper/funda_scraper/__pycache__/scrape.cpython-310.pyc b/scrape/funda-scraper/funda_scraper/__pycache__/scrape.cpython-310.pyc index 308b0686..82a512ec 100644 Binary files a/scrape/funda-scraper/funda_scraper/__pycache__/scrape.cpython-310.pyc and b/scrape/funda-scraper/funda_scraper/__pycache__/scrape.cpython-310.pyc differ diff --git a/scrape/funda-scraper/funda_scraper/__pycache__/utils.cpython-310.pyc b/scrape/funda-scraper/funda_scraper/__pycache__/utils.cpython-310.pyc index cbd8a5ee..54dbd590 100644 Binary files a/scrape/funda-scraper/funda_scraper/__pycache__/utils.cpython-310.pyc and b/scrape/funda-scraper/funda_scraper/__pycache__/utils.cpython-310.pyc differ diff --git a/scrape/funda-scraper/funda_scraper/config/__pycache__/__init__.cpython-310.pyc b/scrape/funda-scraper/funda_scraper/config/__pycache__/__init__.cpython-310.pyc index a36ddbc2..78137621 100644 Binary files a/scrape/funda-scraper/funda_scraper/config/__pycache__/__init__.cpython-310.pyc and b/scrape/funda-scraper/funda_scraper/config/__pycache__/__init__.cpython-310.pyc differ diff --git a/scrape/funda-scraper/funda_scraper/config/__pycache__/core.cpython-310.pyc b/scrape/funda-scraper/funda_scraper/config/__pycache__/core.cpython-310.pyc index c2adf80a..b469acde 100644 Binary files a/scrape/funda-scraper/funda_scraper/config/__pycache__/core.cpython-310.pyc and b/scrape/funda-scraper/funda_scraper/config/__pycache__/core.cpython-310.pyc differ diff --git a/scrape/funda-scraper/funda_scraper/config/config.yaml b/scrape/funda-scraper/funda_scraper/config/config.yaml index df251991..cc807b5b 100644 --- a/scrape/funda-scraper/funda_scraper/config/config.yaml +++ b/scrape/funda-scraper/funda_scraper/config/config.yaml @@ -32,11 +32,11 @@ keep_cols: - photo css_selector: url: none - price: ".object-header__price" - address: ".object-header__title" + price: "#__nuxt>section>main>div>div.relative.m-auto.flex.max-w-screen-lg.flex-col.lg\\:flex-row.xl\\:px-0>div.mt-4.px-4.lg\\:mt-6.lg\\:w-\\[70\\%\\].lg\\:pr-6>div.mb-4.border-b.border-solid.border-neutral-20.pb-4>div.mt-5.flex.flex-wrap.items-center.gap-3.lg\\:mt-6>div>div>span" + address: '#__nuxt>section>main>div>div.relative.m-auto.flex.max-w-screen-lg.flex-col.lg\\:flex-row.xl\\:px-0>div.mt-4.px-4.lg\\:mt-6.lg\\:w-\\[70\\%\\].lg\\:pr-6>div.mb-4.border-b.border-solid.border-neutral-20.pb-4>div.relative.flex.justify-between.pt-2.lg\\:pt-4>h1>span.block.text-2xl.font-bold.md\\:text-3xl.lg\\:text-4xl' descrip: ".object-description-body" listed_since: ".fd-align-items-center:nth-child(6) span" - zip_code: ".object-header__subtitle" + zip_code: '#__nuxt>section>main>div>div.relative.m-auto.flex.max-w-screen-lg.flex-col.lg\\:flex-row.xl\\:px-0>div.mt-4.px-4.lg\\:mt-6.lg\\:w-\\[70\\%\\].lg\\:pr-6>div.mb-4.border-b.border-solid.border-neutral-20.pb-4>div.relative.flex.justify-between.pt-2.lg\\:pt-4>h1>span.text-neutral-40' size: ".fd-m-right-xl--bp-m .fd-text--nowrap" year: ".fd-align-items-center~ .fd-align-items-center .fd-m-right-xs" living_area: ".object-kenmerken-list:nth-child(8) .fd-align-items-center:nth-child(2) span" diff --git a/scrape/funda-scraper/funda_scraper/preprocess.py b/scrape/funda-scraper/funda_scraper/preprocess.py index 7c783bbc..3f52d515 100644 --- a/scrape/funda-scraper/funda_scraper/preprocess.py +++ b/scrape/funda-scraper/funda_scraper/preprocess.py @@ -165,8 +165,8 @@ def preprocess_data(df: pd.DataFrame, is_past: bool) -> pd.DataFrame: keep_cols_sold = keep_cols + config.keep_cols.sold_data # Info - df["house_id"] = df["url"].apply(lambda x: int(x.split("/")[-2].split("-")[1])) - df["house_type"] = df["url"].apply(lambda x: x.split("/")[-2].split("-")[0]) + df["house_id"] = df["url"].apply(lambda x: int(x.split("/")[-2])) + df["house_type"] = df["url"].apply(lambda x: x.split("/")[-3].split("-")[0]) df = df[df["house_type"].isin(["appartement", "huis"])] # Price diff --git a/scrape/panda_dump.bin b/scrape/panda_dump.bin deleted file mode 100644 index 4eb8d96b..00000000 Binary files a/scrape/panda_dump.bin and /dev/null differ diff --git a/scrape/usse.py b/scrape/usse.py index 6c69f4e4..1be1f2ff 100644 --- a/scrape/usse.py +++ b/scrape/usse.py @@ -23,8 +23,9 @@ saved_locations = [] out = [] # URL = "https://www.funda.nl/zoeken/koop?selected_area=%5B%22utrecht,15km%22%5D&price=%22-400000%22&object_type=%5B%22house%22%5D" -URL = "https://www.funda.nl/zoeken/koop?selected_area=%5B%22utrecht,30km%22%5D&price=%22-500000%22&object_type=%5B%22house%22%5D" -NUM_PAGES = 150 +#URL = "https://www.funda.nl/zoeken/koop?selected_area=%5B%22utrecht,30km%22%5D&price=%22-500000%22&object_type=%5B%22house%22%5D" +URL = "https://www.funda.nl/zoeken/koop?selected_area=%5B%22utrecht,30km%22%5D&price=%22-500000%22&object_type=%5B%22house%22%5D&availability=%5B%22available%22%5D" +NUM_PAGES = 1 # 150 def get_funda_data(): # scraper = FundaScraper(url="nijkerk/beschikbaar/100000-400000/woonhuis/tuin/eengezinswoning/landhuis/+30km/", find_past=False, n_pages=81) scraper = FundaScraper(url=URL, find_past=False, n_pages=NUM_PAGES)