# usse/scrape/usse.py — scrape Funda house listings, geocode each address,
# compute road distances to a fixed set of origin locations, and write out.json.
import geopy
# from funda_scraper import FundaScraper
from funda_scraper.scrape import FundaScraper
import datetime, os, pickle, json, tqdm, numpy
import osrm, random

# Self-hosted routing (OSRM) and geocoding (Nominatim) endpoints.
OSRM_HOST = 'https://osrm.herreweb.nl'
NOMINATIM_HOST = 'geocode.herreweb.nl'
osrm_c = osrm.Client(host=OSRM_HOST)
nomi_c = geopy.Nominatim(domain=NOMINATIM_HOST, user_agent="Project Usse", scheme="https")

# Define locations to calculate distances from.
# Values are (longitude, latitude) pairs, the coordinate order OSRM expects.
ORIGIN_LOCATIONS = {
    "nfi_location": (4.3585175985355225, 52.044867266861466),
    "hoogstraat_location": (5.147180442716177, 52.08725689123654),
    "bakkersdijk_location": (4.482033956202426, 51.85802695253161),
    "korhoen_location": (5.732514040876346, 52.5219455005375),
    "harde_location": (5.870995170999243, 52.41650138296019),
}

# Coordinates already emitted; used to detect (and jitter) duplicate points.
saved_locations = []
# Accumulates one dict per house; dumped to out.json at the end of generate_json.
out = []

# Funda search: for sale, within 30 km of Utrecht, <= EUR 500k, available houses.
# URL = "https://www.funda.nl/zoeken/koop?selected_area=%5B%22utrecht,15km%22%5D&price=%22-400000%22&object_type=%5B%22house%22%5D"
# URL = "https://www.funda.nl/zoeken/koop?selected_area=%5B%22utrecht,30km%22%5D&price=%22-500000%22&object_type=%5B%22house%22%5D"
URL = "https://www.funda.nl/zoeken/koop?selected_area=%5B%22utrecht,30km%22%5D&price=%22-500000%22&object_type=%5B%22house%22%5D&availability=%5B%22available%22%5D"
NUM_PAGES = 1  # 150
def get_funda_data():
    """Scrape current Funda search results and return them as a DataFrame.

    Uses the module-level URL / NUM_PAGES configuration; only currently
    listed houses are fetched (find_past=False).
    """
    scraper = FundaScraper(url=URL, find_past=False, n_pages=NUM_PAGES)
    df = scraper.run()
    return df
def get_distances(out_dict, destination_location):
    """Store, for every configured origin, the first OSRM route from
    destination_location into out_dict under the origin's name."""
    for origin_name, origin_coords in ORIGIN_LOCATIONS.items():
        routing = osrm_c.route(coordinates=[destination_location, origin_coords])
        out_dict[origin_name] = routing['routes'][0]
def generate_json(houses):
    """Geocode each scraped house, attach routes to every origin, write out.json.

    houses: the DataFrame returned by get_funda_data(). Rows whose zip code is
    "na" are skipped; rows that cannot be geocoded even by city alone are
    skipped with a warning. Results accumulate in the module-level `out` list
    and are serialized to out.json.
    """
    print("Geocoding distances...")
    count = 0
    for i in tqdm.tqdm(range(len(houses))):
        count += 1
        out_dict = {}
        zip_code = houses.zip.get(i)
        if zip_code == "na":
            # No usable zip code on this listing; nothing to geocode.
            continue

        address = f"{houses.address.get(i)} {houses.city.get(i)}"

        res = nomi_c.geocode(address)
        if res is None:
            # Full address unknown to Nominatim — fall back to the city centre.
            address = f"{houses.city.get(i)}"
            res = nomi_c.geocode(address)
            if res is None:
                print(f"{i}:Failed to get any location for: {address}")
                continue

        point = res.point
        destination_location = [point.longitude, point.latitude]
        if destination_location in saved_locations:
            # Jitter exact duplicates slightly so map markers don't overlap.
            print(f"double for: {address}")
            destination_location[0] += random.randint(1, 10) / 10000
            destination_location[1] += random.randint(1, 10) / 10000
        saved_locations.append(destination_location)

        out_dict['name'] = f"{address}_{count}"  # Fix for duplicate names in dictionary.
        out_dict['position'] = destination_location

        # Copy all scraped columns except the bulky description/photo fields.
        for key in houses.keys():
            if key == "descrip" or key == "photo":
                continue
            out_dict[key] = getattr(houses, key).get(i)
        get_distances(out_dict, destination_location)

        # Convert numpy scalar ints to plain Python ints so json.dumps works.
        for key in out_dict:
            if isinstance(out_dict[key], numpy.integer):
                out_dict[key] = int(out_dict[key])

        out.append(out_dict)

    final = json.dumps(out, indent=2)
    with open('out.json', 'w') as out_f:
        out_f.write(final)
if __name__ == "__main__":
    # Cache the scrape result on disk so reruns skip the slow Funda scrape.
    if os.path.exists('panda_dump.bin'):
        with open('panda_dump.bin', 'rb') as dump_f:
            data = pickle.load(dump_f)
    else:
        data = get_funda_data()
        with open('panda_dump.bin', 'wb') as dump_f:
            pickle.dump(data, dump_f)
    generate_json(data)