update
This commit is contained in:
parent
56377e8dc5
commit
9d433f7639
2
scrape/.vscode/launch.json
vendored
2
scrape/.vscode/launch.json
vendored
@ -10,7 +10,7 @@
|
|||||||
"request": "launch",
|
"request": "launch",
|
||||||
"program": "usse.py",
|
"program": "usse.py",
|
||||||
"console": "integratedTerminal",
|
"console": "integratedTerminal",
|
||||||
"justMyCode": true
|
"justMyCode": false
|
||||||
}
|
}
|
||||||
]
|
]
|
||||||
}
|
}
|
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
@ -32,11 +32,11 @@ keep_cols:
|
|||||||
- photo
|
- photo
|
||||||
css_selector:
|
css_selector:
|
||||||
url: none
|
url: none
|
||||||
price: ".object-header__price"
|
price: "#__nuxt>section>main>div>div.relative.m-auto.flex.max-w-screen-lg.flex-col.lg\\:flex-row.xl\\:px-0>div.mt-4.px-4.lg\\:mt-6.lg\\:w-\\[70\\%\\].lg\\:pr-6>div.mb-4.border-b.border-solid.border-neutral-20.pb-4>div.mt-5.flex.flex-wrap.items-center.gap-3.lg\\:mt-6>div>div>span"
|
||||||
address: ".object-header__title"
|
address: '#__nuxt>section>main>div>div.relative.m-auto.flex.max-w-screen-lg.flex-col.lg\\:flex-row.xl\\:px-0>div.mt-4.px-4.lg\\:mt-6.lg\\:w-\\[70\\%\\].lg\\:pr-6>div.mb-4.border-b.border-solid.border-neutral-20.pb-4>div.relative.flex.justify-between.pt-2.lg\\:pt-4>h1>span.block.text-2xl.font-bold.md\\:text-3xl.lg\\:text-4xl'
|
||||||
descrip: ".object-description-body"
|
descrip: ".object-description-body"
|
||||||
listed_since: ".fd-align-items-center:nth-child(6) span"
|
listed_since: ".fd-align-items-center:nth-child(6) span"
|
||||||
zip_code: ".object-header__subtitle"
|
zip_code: '#__nuxt>section>main>div>div.relative.m-auto.flex.max-w-screen-lg.flex-col.lg\\:flex-row.xl\\:px-0>div.mt-4.px-4.lg\\:mt-6.lg\\:w-\\[70\\%\\].lg\\:pr-6>div.mb-4.border-b.border-solid.border-neutral-20.pb-4>div.relative.flex.justify-between.pt-2.lg\\:pt-4>h1>span.text-neutral-40'
|
||||||
size: ".fd-m-right-xl--bp-m .fd-text--nowrap"
|
size: ".fd-m-right-xl--bp-m .fd-text--nowrap"
|
||||||
year: ".fd-align-items-center~ .fd-align-items-center .fd-m-right-xs"
|
year: ".fd-align-items-center~ .fd-align-items-center .fd-m-right-xs"
|
||||||
living_area: ".object-kenmerken-list:nth-child(8) .fd-align-items-center:nth-child(2) span"
|
living_area: ".object-kenmerken-list:nth-child(8) .fd-align-items-center:nth-child(2) span"
|
||||||
|
@ -165,8 +165,8 @@ def preprocess_data(df: pd.DataFrame, is_past: bool) -> pd.DataFrame:
|
|||||||
keep_cols_sold = keep_cols + config.keep_cols.sold_data
|
keep_cols_sold = keep_cols + config.keep_cols.sold_data
|
||||||
|
|
||||||
# Info
|
# Info
|
||||||
df["house_id"] = df["url"].apply(lambda x: int(x.split("/")[-2].split("-")[1]))
|
df["house_id"] = df["url"].apply(lambda x: int(x.split("/")[-2]))
|
||||||
df["house_type"] = df["url"].apply(lambda x: x.split("/")[-2].split("-")[0])
|
df["house_type"] = df["url"].apply(lambda x: x.split("/")[-3].split("-")[0])
|
||||||
df = df[df["house_type"].isin(["appartement", "huis"])]
|
df = df[df["house_type"].isin(["appartement", "huis"])]
|
||||||
|
|
||||||
# Price
|
# Price
|
||||||
|
Binary file not shown.
@ -23,8 +23,9 @@ saved_locations = []
|
|||||||
out = []
|
out = []
|
||||||
|
|
||||||
# URL = "https://www.funda.nl/zoeken/koop?selected_area=%5B%22utrecht,15km%22%5D&price=%22-400000%22&object_type=%5B%22house%22%5D"
|
# URL = "https://www.funda.nl/zoeken/koop?selected_area=%5B%22utrecht,15km%22%5D&price=%22-400000%22&object_type=%5B%22house%22%5D"
|
||||||
URL = "https://www.funda.nl/zoeken/koop?selected_area=%5B%22utrecht,30km%22%5D&price=%22-500000%22&object_type=%5B%22house%22%5D"
|
#URL = "https://www.funda.nl/zoeken/koop?selected_area=%5B%22utrecht,30km%22%5D&price=%22-500000%22&object_type=%5B%22house%22%5D"
|
||||||
NUM_PAGES = 150
|
URL = "https://www.funda.nl/zoeken/koop?selected_area=%5B%22utrecht,30km%22%5D&price=%22-500000%22&object_type=%5B%22house%22%5D&availability=%5B%22available%22%5D"
|
||||||
|
NUM_PAGES = 1 # 150
|
||||||
def get_funda_data():
|
def get_funda_data():
|
||||||
# scraper = FundaScraper(url="nijkerk/beschikbaar/100000-400000/woonhuis/tuin/eengezinswoning/landhuis/+30km/", find_past=False, n_pages=81)
|
# scraper = FundaScraper(url="nijkerk/beschikbaar/100000-400000/woonhuis/tuin/eengezinswoning/landhuis/+30km/", find_past=False, n_pages=81)
|
||||||
scraper = FundaScraper(url=URL, find_past=False, n_pages=NUM_PAGES)
|
scraper = FundaScraper(url=URL, find_past=False, n_pages=NUM_PAGES)
|
||||||
|
Loading…
Reference in New Issue
Block a user