update
This commit is contained in:
parent
56377e8dc5
commit
9d433f7639
2
scrape/.vscode/launch.json
vendored
2
scrape/.vscode/launch.json
vendored
@ -10,7 +10,7 @@
|
||||
"request": "launch",
|
||||
"program": "usse.py",
|
||||
"console": "integratedTerminal",
|
||||
"justMyCode": true
|
||||
"justMyCode": false
|
||||
}
|
||||
]
|
||||
}
|
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
@ -32,11 +32,11 @@ keep_cols:
|
||||
- photo
|
||||
css_selector:
|
||||
url: none
|
||||
price: ".object-header__price"
|
||||
address: ".object-header__title"
|
||||
price: "#__nuxt>section>main>div>div.relative.m-auto.flex.max-w-screen-lg.flex-col.lg\\:flex-row.xl\\:px-0>div.mt-4.px-4.lg\\:mt-6.lg\\:w-\\[70\\%\\].lg\\:pr-6>div.mb-4.border-b.border-solid.border-neutral-20.pb-4>div.mt-5.flex.flex-wrap.items-center.gap-3.lg\\:mt-6>div>div>span"
|
||||
address: '#__nuxt>section>main>div>div.relative.m-auto.flex.max-w-screen-lg.flex-col.lg\\:flex-row.xl\\:px-0>div.mt-4.px-4.lg\\:mt-6.lg\\:w-\\[70\\%\\].lg\\:pr-6>div.mb-4.border-b.border-solid.border-neutral-20.pb-4>div.relative.flex.justify-between.pt-2.lg\\:pt-4>h1>span.block.text-2xl.font-bold.md\\:text-3xl.lg\\:text-4xl'
|
||||
descrip: ".object-description-body"
|
||||
listed_since: ".fd-align-items-center:nth-child(6) span"
|
||||
zip_code: ".object-header__subtitle"
|
||||
zip_code: '#__nuxt>section>main>div>div.relative.m-auto.flex.max-w-screen-lg.flex-col.lg\\:flex-row.xl\\:px-0>div.mt-4.px-4.lg\\:mt-6.lg\\:w-\\[70\\%\\].lg\\:pr-6>div.mb-4.border-b.border-solid.border-neutral-20.pb-4>div.relative.flex.justify-between.pt-2.lg\\:pt-4>h1>span.text-neutral-40'
|
||||
size: ".fd-m-right-xl--bp-m .fd-text--nowrap"
|
||||
year: ".fd-align-items-center~ .fd-align-items-center .fd-m-right-xs"
|
||||
living_area: ".object-kenmerken-list:nth-child(8) .fd-align-items-center:nth-child(2) span"
|
||||
|
@ -165,8 +165,8 @@ def preprocess_data(df: pd.DataFrame, is_past: bool) -> pd.DataFrame:
|
||||
keep_cols_sold = keep_cols + config.keep_cols.sold_data
|
||||
|
||||
# Info
|
||||
df["house_id"] = df["url"].apply(lambda x: int(x.split("/")[-2].split("-")[1]))
|
||||
df["house_type"] = df["url"].apply(lambda x: x.split("/")[-2].split("-")[0])
|
||||
df["house_id"] = df["url"].apply(lambda x: int(x.split("/")[-2]))
|
||||
df["house_type"] = df["url"].apply(lambda x: x.split("/")[-3].split("-")[0])
|
||||
df = df[df["house_type"].isin(["appartement", "huis"])]
|
||||
|
||||
# Price
|
||||
|
Binary file not shown.
@ -23,8 +23,9 @@ saved_locations = []
|
||||
out = []
|
||||
|
||||
# URL = "https://www.funda.nl/zoeken/koop?selected_area=%5B%22utrecht,15km%22%5D&price=%22-400000%22&object_type=%5B%22house%22%5D"
|
||||
URL = "https://www.funda.nl/zoeken/koop?selected_area=%5B%22utrecht,30km%22%5D&price=%22-500000%22&object_type=%5B%22house%22%5D"
|
||||
NUM_PAGES = 150
|
||||
#URL = "https://www.funda.nl/zoeken/koop?selected_area=%5B%22utrecht,30km%22%5D&price=%22-500000%22&object_type=%5B%22house%22%5D"
|
||||
URL = "https://www.funda.nl/zoeken/koop?selected_area=%5B%22utrecht,30km%22%5D&price=%22-500000%22&object_type=%5B%22house%22%5D&availability=%5B%22available%22%5D"
|
||||
NUM_PAGES = 1 # 150
|
||||
def get_funda_data():
|
||||
# scraper = FundaScraper(url="nijkerk/beschikbaar/100000-400000/woonhuis/tuin/eengezinswoning/landhuis/+30km/", find_past=False, n_pages=81)
|
||||
scraper = FundaScraper(url=URL, find_past=False, n_pages=NUM_PAGES)
|
||||
|
Loading…
Reference in New Issue
Block a user