This commit is contained in:
Eljakim 2024-08-24 16:52:50 +00:00
parent 56377e8dc5
commit 9d433f7639
11 changed files with 9 additions and 8 deletions

View File

@@ -10,7 +10,7 @@
"request": "launch",
"program": "usse.py",
"console": "integratedTerminal",
"justMyCode": true
"justMyCode": false
}
]
}

View File

@@ -32,11 +32,11 @@ keep_cols:
- photo
css_selector:
url: none
price: ".object-header__price"
address: ".object-header__title"
price: "#__nuxt>section>main>div>div.relative.m-auto.flex.max-w-screen-lg.flex-col.lg\\:flex-row.xl\\:px-0>div.mt-4.px-4.lg\\:mt-6.lg\\:w-\\[70\\%\\].lg\\:pr-6>div.mb-4.border-b.border-solid.border-neutral-20.pb-4>div.mt-5.flex.flex-wrap.items-center.gap-3.lg\\:mt-6>div>div>span"
address: '#__nuxt>section>main>div>div.relative.m-auto.flex.max-w-screen-lg.flex-col.lg\\:flex-row.xl\\:px-0>div.mt-4.px-4.lg\\:mt-6.lg\\:w-\\[70\\%\\].lg\\:pr-6>div.mb-4.border-b.border-solid.border-neutral-20.pb-4>div.relative.flex.justify-between.pt-2.lg\\:pt-4>h1>span.block.text-2xl.font-bold.md\\:text-3xl.lg\\:text-4xl'
descrip: ".object-description-body"
listed_since: ".fd-align-items-center:nth-child(6) span"
zip_code: ".object-header__subtitle"
zip_code: '#__nuxt>section>main>div>div.relative.m-auto.flex.max-w-screen-lg.flex-col.lg\\:flex-row.xl\\:px-0>div.mt-4.px-4.lg\\:mt-6.lg\\:w-\\[70\\%\\].lg\\:pr-6>div.mb-4.border-b.border-solid.border-neutral-20.pb-4>div.relative.flex.justify-between.pt-2.lg\\:pt-4>h1>span.text-neutral-40'
size: ".fd-m-right-xl--bp-m .fd-text--nowrap"
year: ".fd-align-items-center~ .fd-align-items-center .fd-m-right-xs"
living_area: ".object-kenmerken-list:nth-child(8) .fd-align-items-center:nth-child(2) span"

View File

@@ -165,8 +165,8 @@ def preprocess_data(df: pd.DataFrame, is_past: bool) -> pd.DataFrame:
keep_cols_sold = keep_cols + config.keep_cols.sold_data
# Info
df["house_id"] = df["url"].apply(lambda x: int(x.split("/")[-2].split("-")[1]))
df["house_type"] = df["url"].apply(lambda x: x.split("/")[-2].split("-")[0])
df["house_id"] = df["url"].apply(lambda x: int(x.split("/")[-2]))
df["house_type"] = df["url"].apply(lambda x: x.split("/")[-3].split("-")[0])
df = df[df["house_type"].isin(["appartement", "huis"])]
# Price

Binary file not shown.

View File

@@ -23,8 +23,9 @@ saved_locations = []
out = []
# URL = "https://www.funda.nl/zoeken/koop?selected_area=%5B%22utrecht,15km%22%5D&price=%22-400000%22&object_type=%5B%22house%22%5D"
URL = "https://www.funda.nl/zoeken/koop?selected_area=%5B%22utrecht,30km%22%5D&price=%22-500000%22&object_type=%5B%22house%22%5D"
NUM_PAGES = 150
#URL = "https://www.funda.nl/zoeken/koop?selected_area=%5B%22utrecht,30km%22%5D&price=%22-500000%22&object_type=%5B%22house%22%5D"
URL = "https://www.funda.nl/zoeken/koop?selected_area=%5B%22utrecht,30km%22%5D&price=%22-500000%22&object_type=%5B%22house%22%5D&availability=%5B%22available%22%5D"
NUM_PAGES = 1 # 150
def get_funda_data():
# scraper = FundaScraper(url="nijkerk/beschikbaar/100000-400000/woonhuis/tuin/eengezinswoning/landhuis/+30km/", find_past=False, n_pages=81)
scraper = FundaScraper(url=URL, find_past=False, n_pages=NUM_PAGES)