"""Scrape Funda housing listings, geocode each address, compute road-route
distances to a fixed set of origin locations via OSRM, and dump the combined
records to ``out.json``.

Scraped data is cached in ``panda_dump.bin`` so repeated runs skip the scrape.
"""

import datetime
import json
import os
import pickle
import random

import geopy
import numpy
import osrm
import tqdm

from funda_scraper.scrape import FundaScraper

# Self-hosted routing / geocoding backends.
OSRM_HOST = 'https://osrm.herreweb.nl'
NOMINATIM_HOST = 'geocode.herreweb.nl'

osrm_c = osrm.Client(host=OSRM_HOST)
nomi_c = geopy.Nominatim(domain=NOMINATIM_HOST, user_agent="Project Usse", scheme="https")

# Origin points as (lon, lat) pairs; a route distance to each is attached to
# every house record.
ORIGIN_LOCATIONS = {
    "nfi_location": (4.3585175985355225, 52.044867266861466),
    "hoogstraat_location": (5.147180442716177, 52.08725689123654),
    "bakkersdijk_location": (4.482033956202426, 51.85802695253161),
    "korhoen_location": (5.732514040876346, 52.5219455005375),
    "harde_location": (5.870995170999243, 52.41650138296019),
}

saved_locations = []  # geocoded (lon, lat) pairs seen so far, for duplicate detection
out = []  # accumulated per-house dicts, serialized to out.json at the end

# URL = "https://www.funda.nl/zoeken/koop?selected_area=%5B%22utrecht,15km%22%5D&price=%22-400000%22&object_type=%5B%22house%22%5D"
URL = "https://www.funda.nl/zoeken/koop?selected_area=%5B%22utrecht,30km%22%5D&price=%22-500000%22&object_type=%5B%22house%22%5D"


def get_funda_data():
    """Scrape current (not past) Funda listings for URL.

    Returns the pandas DataFrame produced by FundaScraper.run().
    """
    scraper = FundaScraper(url=URL, find_past=False, n_pages=150)
    return scraper.run()


def get_distances(out_dict, destination_location):
    """Attach an OSRM route from destination_location to every origin.

    One route dict (``routes[0]`` of the OSRM response) is written into
    *out_dict* under each ORIGIN_LOCATIONS key. *destination_location* is a
    (lon, lat) pair, matching the OSRM coordinate order.
    """
    for key, origin in ORIGIN_LOCATIONS.items():
        response = osrm_c.route(coordinates=[destination_location, origin])
        out_dict[key] = response['routes'][0]


def generate_json(houses):
    """Geocode each house in *houses*, add route distances, write out.json.

    Rows with zip code "na" or that cannot be geocoded (even by city alone)
    are skipped. Duplicate coordinates are jittered slightly so map markers
    do not overlap exactly.
    """
    count = 0
    for i in tqdm.tqdm(range(len(houses))):
        count += 1
        out_dict = {}
        if houses.zip.get(i) == "na":
            continue  # listing without a usable zip code

        address = f"{houses.address.get(i)} {houses.city.get(i)}"
        res = nomi_c.geocode(address)
        if res is None:
            # Full address failed; fall back to geocoding just the city.
            address = f"{houses.city.get(i)}"
            res = nomi_c.geocode(address)
        if res is None:
            print(f"{i}:Failed to get any location for: {address}")
            continue

        destination_location = [res.point.longitude, res.point.latitude]
        if destination_location in saved_locations:
            print(f"double for: {address}")
            # Nudge by up to ~0.001 degrees so identical geocodes stay distinct.
            destination_location[0] += random.randint(1, 10) / 10000
            destination_location[1] += random.randint(1, 10) / 10000
        saved_locations.append(destination_location)

        out_dict['name'] = f"{address}_{count}"  # count keeps names unique
        out_dict['position'] = destination_location
        for key in houses.keys():
            if key in ("descrip", "photo"):
                continue  # bulky fields not needed in the output
            out_dict[key] = getattr(houses, key).get(i)
        get_distances(out_dict, destination_location)

        # Convert numpy integer scalars to plain ints so json can serialize them.
        for key in out_dict:
            if isinstance(out_dict[key], numpy.integer):
                out_dict[key] = int(out_dict[key])
        out.append(out_dict)

    with open('out.json', 'w') as out_f:
        json.dump(out, out_f, indent=2)


if __name__ == "__main__":
    # Cache the scraped DataFrame so re-runs skip the slow scrape.
    if os.path.exists('panda_dump.bin'):
        with open('panda_dump.bin', 'rb') as f:
            data = pickle.load(f)
    else:
        data = get_funda_data()
        with open('panda_dump.bin', 'wb') as f:
            pickle.dump(data, f)
    generate_json(data)