# geopy/extra/rate_limiter.py
""":class:`.RateLimiter` and :class:`.AsyncRateLimiter` allow to perform bulk
operations while gracefully handling error responses and adding delays
when needed.
In the example below a delay of 1 second (``min_delay_seconds=1``)
will be added between each pair of ``geolocator.geocode`` calls; all
:class:`geopy.exc.GeocoderServiceError` exceptions will be retried
(up to ``max_retries`` times)::
import pandas as pd
df = pd.DataFrame({'name': ['paris', 'berlin', 'london']})
from geopy.geocoders import Nominatim
geolocator = Nominatim(user_agent="specify_your_app_name_here")
from geopy.extra.rate_limiter import RateLimiter
geocode = RateLimiter(geolocator.geocode, min_delay_seconds=1)
df['location'] = df['name'].apply(geocode)
df['point'] = df['location'].apply(lambda loc: tuple(loc.point) if loc else None)
This would produce the following DataFrame::
>>> df
name location \\
0 paris (Paris, Île-de-France, France métropolitaine, ...
1 berlin (Berlin, 10117, Deutschland, (52.5170365, 13.3...
2 london (London, Greater London, England, SW1A 2DU, UK...
point
0 (48.8566101, 2.3514992, 0.0)
1 (52.5170365, 13.3888599, 0.0)
2 (51.5073219, -0.1276474, 0.0)
To pass extra options to the `geocode` call::
from functools import partial
df['location'] = df['name'].apply(partial(geocode, language='de'))
To see a progress bar::
from tqdm import tqdm
tqdm.pandas()
df['location'] = df['name'].progress_apply(geocode)
Before using rate limiting classes, please consult with the Geocoding
service ToS, which might explicitly consider bulk requests (even throttled)
a violation.
"""
import asyncio
import inspect
import threading
from itertools import chain, count
from time import sleep
from timeit import default_timer

from geopy.exc import GeocoderServiceError
from geopy.util import logger

__all__ = ("AsyncRateLimiter", "RateLimiter")


def _is_last_gen(count):
    """list(_is_last_gen(2)) -> [False, False, True]"""
    return chain((False for _ in range(count)), [True])


class BaseRateLimiter:
    """Base Rate Limiter class for both sync and async versions."""

    _retry_exceptions = (GeocoderServiceError,)

    def __init__(
            self,
            *,
            min_delay_seconds,
            max_retries,
            swallow_exceptions,
            return_value_on_exception
    ):
self.min_delay_seconds = min_delay_seconds
self.max_retries = max_retries
self.swallow_exceptions = swallow_exceptions
self.return_value_on_exception = return_value_on_exception
        assert max_retries >= 0

        # State:
        self._lock = threading.Lock()
        self._last_call = None

    def _clock(self):  # pragma: no cover
        return default_timer()

    def _acquire_request_slot_gen(self):
# Requests rate is limited by `min_delay_seconds` interval.
#
# Imagine the time axis as a grid with `min_delay_seconds` step,
# where we would call each step as a "request slot". RateLimiter
# guarantees that each "request slot" contains at most 1 request.
#
        # Note that actual requests might take longer than
        # `min_delay_seconds`. In that case you might want to consider
# parallelizing requests (with a ThreadPool for sync mode and
# asyncio tasks for async), to keep the requests rate closer
# to `min_delay_seconds`.
#
# This generator thread-safely acquires a "request slot", and
# if it fails to do that at this time, it yields the amount
# of seconds to sleep until the next attempt. The generator
# stops only when the "request slot" has been successfully
# acquired.
#
# There's no ordering between the concurrent requests. The first
# request to acquire the lock wins the next "request slot".
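        #
        # For example, with `min_delay_seconds=1`:
        #
        #   t=0.0  request A takes the lock and acquires slot 0
        #   t=0.3  request B takes the lock; slot 0 is taken, so it
        #          yields a 0.7 second wait
        #   t=1.0  request B retries and acquires slot 1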
while True:
with self._lock:
clock = self._clock()
if self._last_call is None:
                    # The first call -- start immediately.
self._last_call = clock
return
seconds_since_last_call = clock - self._last_call
wait = self.min_delay_seconds - seconds_since_last_call
if wait <= 0:
# A successfully acquired request slot.
self._last_call = clock
return
# Couldn't acquire a request slot. Wait until the beginning
# of the next slot to try again.
            yield wait

    def _retries_gen(self, args, kwargs):
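        # Generator-driven retry protocol: the caller iterates this
        # generator and calls the wrapped function once per yielded `i`.
        # On a retryable error the caller resumes the generator with
        # `gen.throw(exc)`; the value produced then is True when the
        # retries are exhausted (the caller must handle the failure) or
        # False when the error was logged and another attempt follows.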
for i, is_last_try in zip(count(), _is_last_gen(self.max_retries)):
try:
yield i # Run the function.
except self._retry_exceptions:
if is_last_try:
yield True # The exception should be raised
else:
logger.warning(
type(self).__name__ + " caught an error, retrying "
"(%s/%s tries). Called with (*%r, **%r).",
i,
self.max_retries,
args,
kwargs,
exc_info=True,
)
yield False # The exception has been swallowed.
continue
else:
# A successful run -- stop retrying:
                return  # pragma: no cover

    def _handle_exc(self, args, kwargs):
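        # Note: this must be called from inside an `except` block,
        # because the bare `raise` below re-raises the exception
        # currently being handled.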
if self.swallow_exceptions:
logger.warning(
type(self).__name__ + " swallowed an error after %r retries. "
"Called with (*%r, **%r).",
self.max_retries,
args,
kwargs,
exc_info=True,
)
return self.return_value_on_exception
else:
            raise


class RateLimiter(BaseRateLimiter):
    """This is a Rate Limiter implementation for synchronous functions
    (like geocoders with the default :class:`geopy.adapters.BaseSyncAdapter`).

    Examples::

        from geopy.extra.rate_limiter import RateLimiter
        from geopy.geocoders import Nominatim

        geolocator = Nominatim(user_agent="specify_your_app_name_here")

        search = ["moscow", "paris", "berlin", "tokyo", "beijing"]
        geocode = RateLimiter(geolocator.geocode, min_delay_seconds=1)
        locations = [geocode(s) for s in search]

        search = [
            (55.47, 37.32), (48.85, 2.35), (52.51, 13.38),
            (34.69, 139.40), (39.90, 116.39)
        ]
        reverse = RateLimiter(geolocator.reverse, min_delay_seconds=1)
        locations = [reverse(s) for s in search]

    The RateLimiter class is thread-safe. If the geocoding service's
    responses are slower than ``min_delay_seconds``, you can benefit
    from parallelizing the work::

        import concurrent.futures

        from geopy.geocoders import OpenMapQuest

        geolocator = OpenMapQuest(api_key="...")
        geocode = RateLimiter(geolocator.geocode, min_delay_seconds=1/20)

        with concurrent.futures.ThreadPoolExecutor() as e:
            locations = list(e.map(geocode, search))

    .. versionchanged:: 2.0
        Added thread-safety support.
    """

    def __init__(
            self,
            func,
            *,
            min_delay_seconds=0.0,
            max_retries=2,
            error_wait_seconds=5.0,
            swallow_exceptions=True,
            return_value_on_exception=None
    ):
"""
:param callable func:
A function which should be wrapped by the rate limiter.
:param float min_delay_seconds:
Minimum delay in seconds between the wrapped ``func`` calls.
To convert :abbr:`RPS (Requests Per Second)` rate to
``min_delay_seconds`` you need to divide 1 by RPS. For example,
if you need to keep the rate at 20 RPS, you can use
``min_delay_seconds=1/20``.
:param int max_retries:
Number of retries on exceptions. Only
:class:`geopy.exc.GeocoderServiceError` exceptions are
retried -- others are always re-raised. ``max_retries + 1``
requests would be performed at max per query. Set
``max_retries=0`` to disable retries.
:param float error_wait_seconds:
Time to wait between retries after errors. Must be
greater or equal to ``min_delay_seconds``.
:param bool swallow_exceptions:
Should an exception be swallowed after retries? If not,
it will be re-raised. If yes, the ``return_value_on_exception``
will be returned.
:param return_value_on_exception:
Value to return on failure when ``swallow_exceptions=True``.
"""
super().__init__(
min_delay_seconds=min_delay_seconds,
max_retries=max_retries,
swallow_exceptions=swallow_exceptions,
return_value_on_exception=return_value_on_exception,
)
self.func = func
self.error_wait_seconds = error_wait_seconds
assert error_wait_seconds >= min_delay_seconds
        assert max_retries >= 0

    def _sleep(self, seconds):  # pragma: no cover
        logger.debug(type(self).__name__ + " sleep(%r)", seconds)
        sleep(seconds)

    def _acquire_request_slot(self):
        for wait in self._acquire_request_slot_gen():
            self._sleep(wait)

    def __call__(self, *args, **kwargs):
gen = self._retries_gen(args, kwargs)
for _ in gen:
self._acquire_request_slot()
try:
res = self.func(*args, **kwargs)
if inspect.isawaitable(res):
raise ValueError(
"An async awaitable has been passed to `RateLimiter`. "
"Use `AsyncRateLimiter` instead, which supports awaitables."
)
return res
except self._retry_exceptions as e:
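                # `gen.throw(e)` resumes `_retries_gen` inside its
                # `yield`; a True result means the retries are exhausted.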
if gen.throw(e):
# A final try
return self._handle_exc(args, kwargs)
self._sleep(self.error_wait_seconds)
        raise RuntimeError("Should not have been reached")  # pragma: no cover


class AsyncRateLimiter(BaseRateLimiter):
    """This is a Rate Limiter implementation for asynchronous functions
    (like geocoders with :class:`geopy.adapters.BaseAsyncAdapter`).

    Examples::

        from geopy.adapters import AioHTTPAdapter
        from geopy.extra.rate_limiter import AsyncRateLimiter
        from geopy.geocoders import Nominatim

        async with Nominatim(
            user_agent="specify_your_app_name_here",
            adapter_factory=AioHTTPAdapter,
        ) as geolocator:
            search = ["moscow", "paris", "berlin", "tokyo", "beijing"]
            geocode = AsyncRateLimiter(geolocator.geocode, min_delay_seconds=1)
            locations = [await geocode(s) for s in search]

            search = [
                (55.47, 37.32), (48.85, 2.35), (52.51, 13.38),
                (34.69, 139.40), (39.90, 116.39)
            ]
            reverse = AsyncRateLimiter(geolocator.reverse, min_delay_seconds=1)
            locations = [await reverse(s) for s in search]

    The AsyncRateLimiter class is safe to use across multiple concurrent
    tasks. If the geocoding service's responses are slower than
    ``min_delay_seconds``, you can benefit from parallelizing the work::

        import asyncio

        from geopy.adapters import AioHTTPAdapter
        from geopy.geocoders import OpenMapQuest

        async with OpenMapQuest(
            api_key="...", adapter_factory=AioHTTPAdapter
        ) as geolocator:
            geocode = AsyncRateLimiter(geolocator.geocode, min_delay_seconds=1/20)
            locations = await asyncio.gather(*(geocode(s) for s in search))

    .. versionadded:: 2.0
    """

    def __init__(
            self,
            func,
            *,
            min_delay_seconds=0.0,
            max_retries=2,
            error_wait_seconds=5.0,
            swallow_exceptions=True,
            return_value_on_exception=None
    ):
"""
:param callable func:
A function which should be wrapped by the rate limiter.
:param float min_delay_seconds:
Minimum delay in seconds between the wrapped ``func`` calls.
To convert :abbr:`RPS (Requests Per Second)` rate to
``min_delay_seconds`` you need to divide 1 by RPS. For example,
if you need to keep the rate at 20 RPS, you can use
``min_delay_seconds=1/20``.
:param int max_retries:
Number of retries on exceptions. Only
:class:`geopy.exc.GeocoderServiceError` exceptions are
retried -- others are always re-raised. ``max_retries + 1``
requests would be performed at max per query. Set
``max_retries=0`` to disable retries.
:param float error_wait_seconds:
Time to wait between retries after errors. Must be
greater or equal to ``min_delay_seconds``.
:param bool swallow_exceptions:
Should an exception be swallowed after retries? If not,
it will be re-raised. If yes, the ``return_value_on_exception``
will be returned.
:param return_value_on_exception:
Value to return on failure when ``swallow_exceptions=True``.
"""
super().__init__(
min_delay_seconds=min_delay_seconds,
max_retries=max_retries,
swallow_exceptions=swallow_exceptions,
return_value_on_exception=return_value_on_exception,
)
self.func = func
self.error_wait_seconds = error_wait_seconds
assert error_wait_seconds >= min_delay_seconds
        assert max_retries >= 0

    async def _sleep(self, seconds):  # pragma: no cover
        logger.debug(type(self).__name__ + " sleep(%r)", seconds)
        await asyncio.sleep(seconds)

    async def _acquire_request_slot(self):
        for wait in self._acquire_request_slot_gen():
            await self._sleep(wait)

    async def __call__(self, *args, **kwargs):
gen = self._retries_gen(args, kwargs)
for _ in gen:
await self._acquire_request_slot()
try:
return await self.func(*args, **kwargs)
except self._retry_exceptions as e:
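                # As in `RateLimiter.__call__`: a True result from
                # `gen.throw(e)` means the retries are exhausted.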
if gen.throw(e):
# A final try
return self._handle_exc(args, kwargs)
await self._sleep(self.error_wait_seconds)
        raise RuntimeError("Should not have been reached")  # pragma: no cover