""":class:`.RateLimiter` and :class:`.AsyncRateLimiter` allow to perform bulk
|
|
operations while gracefully handling error responses and adding delays
|
|
when needed.
|
|
|
|
In the example below a delay of 1 second (``min_delay_seconds=1``)
|
|
will be added between each pair of ``geolocator.geocode`` calls; all
|
|
:class:`geopy.exc.GeocoderServiceError` exceptions will be retried
|
|
(up to ``max_retries`` times)::
|
|
|
|
import pandas as pd
|
|
df = pd.DataFrame({'name': ['paris', 'berlin', 'london']})
|
|
|
|
from geopy.geocoders import Nominatim
|
|
geolocator = Nominatim(user_agent="specify_your_app_name_here")
|
|
|
|
from geopy.extra.rate_limiter import RateLimiter
|
|
geocode = RateLimiter(geolocator.geocode, min_delay_seconds=1)
|
|
df['location'] = df['name'].apply(geocode)
|
|
|
|
df['point'] = df['location'].apply(lambda loc: tuple(loc.point) if loc else None)
|
|
|
|
This would produce the following DataFrame::
|
|
|
|
>>> df
|
|
name location \\
|
|
0 paris (Paris, Île-de-France, France métropolitaine, ...
|
|
1 berlin (Berlin, 10117, Deutschland, (52.5170365, 13.3...
|
|
2 london (London, Greater London, England, SW1A 2DU, UK...
|
|
|
|
point
|
|
0 (48.8566101, 2.3514992, 0.0)
|
|
1 (52.5170365, 13.3888599, 0.0)
|
|
2 (51.5073219, -0.1276474, 0.0)
|
|
|
|
To pass extra options to the `geocode` call::
|
|
|
|
from functools import partial
|
|
df['location'] = df['name'].apply(partial(geocode, language='de'))
|
|
|
|
To see a progress bar::
|
|
|
|
from tqdm import tqdm
|
|
tqdm.pandas()
|
|
df['location'] = df['name'].progress_apply(geocode)
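
A query that still fails after all the retries is swallowed by default
(see the ``swallow_exceptions`` parameter below), in which case
``return_value_on_exception`` (``None`` by default) is returned. To make
such failures raise instead::

    geocode = RateLimiter(
        geolocator.geocode, min_delay_seconds=1, swallow_exceptions=False
    )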

Before using the rate limiting classes, please consult the Geocoding
service's ToS, which might explicitly consider bulk requests (even
throttled) a violation.
"""
import asyncio
import inspect
import threading
from itertools import chain, count
from time import sleep
from timeit import default_timer

from geopy.exc import GeocoderServiceError
from geopy.util import logger

__all__ = ("AsyncRateLimiter", "RateLimiter")


def _is_last_gen(count):
    """list(_is_last_gen(2)) -> [False, False, True]"""
    return chain((False for _ in range(count)), [True])


class BaseRateLimiter:
    """Base Rate Limiter class for both sync and async versions."""

    _retry_exceptions = (GeocoderServiceError,)

    def __init__(
        self,
        *,
        min_delay_seconds,
        max_retries,
        swallow_exceptions,
        return_value_on_exception
    ):
        self.min_delay_seconds = min_delay_seconds
        self.max_retries = max_retries
        self.swallow_exceptions = swallow_exceptions
        self.return_value_on_exception = return_value_on_exception
        assert max_retries >= 0

        # State:
        self._lock = threading.Lock()
        self._last_call = None

    def _clock(self):  # pragma: no cover
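        # timeit.default_timer is time.perf_counter on Python 3:
        # a monotonic clock, unaffected by system clock adjustments.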
        return default_timer()

    def _acquire_request_slot_gen(self):
        # The requests rate is limited by the `min_delay_seconds` interval.
        #
        # Imagine the time axis as a grid with a `min_delay_seconds` step,
        # where each step is called a "request slot". RateLimiter
        # guarantees that each "request slot" contains at most 1 request.
        #
        # Note that actual requests might take longer than
        # `min_delay_seconds`. In that case you might want to consider
        # parallelizing requests (with a ThreadPool for sync mode and
        # asyncio tasks for async), to keep the requests rate closer
        # to `min_delay_seconds`.
        #
        # This generator thread-safely acquires a "request slot", and
        # if it fails to do that at this time, it yields the amount
        # of seconds to sleep until the next attempt. The generator
        # stops only when the "request slot" has been successfully
        # acquired.
        #
        # There's no ordering between the concurrent requests. The first
        # request to acquire the lock wins the next "request slot".
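        #
        # Illustration with `min_delay_seconds=1`: a request at t=0.0
        # takes the [0.0, 1.0) slot; a concurrent request arriving at
        # t=0.4 yields a 0.6 seconds wait and acquires the next slot
        # at t=1.0.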
        while True:
            with self._lock:
                clock = self._clock()
                if self._last_call is None:
                    # The first iteration -- start immediately.
                    self._last_call = clock
                    return
                seconds_since_last_call = clock - self._last_call
                wait = self.min_delay_seconds - seconds_since_last_call
                if wait <= 0:
                    # Successfully acquired a request slot.
                    self._last_call = clock
                    return
            # Couldn't acquire a request slot. Wait until the beginning
            # of the next slot to try again.
            yield wait

    def _retries_gen(self, args, kwargs):
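        # Generator protocol (driven by `__call__`): every yielded index
        # means "attempt the call now". When the call raises a retryable
        # exception, `__call__` throws it back in via `gen.throw()`; the
        # generator then yields True if the retries are exhausted (the
        # caller must give up) or False if the exception has been
        # swallowed and another attempt should follow.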
        for i, is_last_try in zip(count(), _is_last_gen(self.max_retries)):
            try:
                yield i  # Run the function.
            except self._retry_exceptions:
                if is_last_try:
                    yield True  # The exception should be raised.
                else:
                    logger.warning(
                        type(self).__name__ + " caught an error, retrying "
                        "(%s/%s tries). Called with (*%r, **%r).",
                        i,
                        self.max_retries,
                        args,
                        kwargs,
                        exc_info=True,
                    )
                    yield False  # The exception has been swallowed.
                    continue
            else:
                # A successful run -- stop retrying:
                return  # pragma: no cover

    def _handle_exc(self, args, kwargs):
        if self.swallow_exceptions:
            logger.warning(
                type(self).__name__ + " swallowed an error after %r retries. "
                "Called with (*%r, **%r).",
                self.max_retries,
                args,
                kwargs,
                exc_info=True,
            )
            return self.return_value_on_exception
        else:
            raise


class RateLimiter(BaseRateLimiter):
    """This is a Rate Limiter implementation for synchronous functions
    (like geocoders with the default :class:`geopy.adapters.BaseSyncAdapter`).

    Examples::

        from geopy.extra.rate_limiter import RateLimiter
        from geopy.geocoders import Nominatim

        geolocator = Nominatim(user_agent="specify_your_app_name_here")

        search = ["moscow", "paris", "berlin", "tokyo", "beijing"]
        geocode = RateLimiter(geolocator.geocode, min_delay_seconds=1)
        locations = [geocode(s) for s in search]

        search = [
            (55.47, 37.32), (48.85, 2.35), (52.51, 13.38),
            (34.69, 139.40), (39.90, 116.39)
        ]
        reverse = RateLimiter(geolocator.reverse, min_delay_seconds=1)
        locations = [reverse(s) for s in search]

    The RateLimiter class is thread-safe. If the geocoding service's
    responses are slower than ``min_delay_seconds``, then you can benefit
    from parallelizing the work::

        import concurrent.futures

        from geopy.geocoders import OpenMapQuest

        geolocator = OpenMapQuest(api_key="...")
        geocode = RateLimiter(geolocator.geocode, min_delay_seconds=1/20)

        with concurrent.futures.ThreadPoolExecutor() as e:
            locations = list(e.map(geocode, search))

    .. versionchanged:: 2.0
        Added thread-safety support.
    """

    def __init__(
        self,
        func,
        *,
        min_delay_seconds=0.0,
        max_retries=2,
        error_wait_seconds=5.0,
        swallow_exceptions=True,
        return_value_on_exception=None
    ):
        """
        :param callable func:
            A function which should be wrapped by the rate limiter.

        :param float min_delay_seconds:
            Minimum delay in seconds between the wrapped ``func`` calls.
            To convert an :abbr:`RPS (Requests Per Second)` rate to
            ``min_delay_seconds`` you need to divide 1 by RPS. For example,
            if you need to keep the rate at 20 RPS, you can use
            ``min_delay_seconds=1/20``.

        :param int max_retries:
            Number of retries on exceptions. Only
            :class:`geopy.exc.GeocoderServiceError` exceptions are
            retried -- others are always re-raised. At most
            ``max_retries + 1`` requests are performed per query. Set
            ``max_retries=0`` to disable retries.

        :param float error_wait_seconds:
            Time to wait between retries after errors. Must be
            greater than or equal to ``min_delay_seconds``.

        :param bool swallow_exceptions:
            Should an exception be swallowed after retries? If not,
            it will be re-raised. If yes, the ``return_value_on_exception``
            will be returned.

        :param return_value_on_exception:
            Value to return on failure when ``swallow_exceptions=True``.

        """
        super().__init__(
            min_delay_seconds=min_delay_seconds,
            max_retries=max_retries,
            swallow_exceptions=swallow_exceptions,
            return_value_on_exception=return_value_on_exception,
        )
        self.func = func
        self.error_wait_seconds = error_wait_seconds
        assert error_wait_seconds >= min_delay_seconds
        assert max_retries >= 0

    def _sleep(self, seconds):  # pragma: no cover
        logger.debug(type(self).__name__ + " sleep(%r)", seconds)
        sleep(seconds)

    def _acquire_request_slot(self):
        for wait in self._acquire_request_slot_gen():
            self._sleep(wait)

    def __call__(self, *args, **kwargs):
        gen = self._retries_gen(args, kwargs)
        for _ in gen:
            self._acquire_request_slot()
            try:
                res = self.func(*args, **kwargs)
                if inspect.isawaitable(res):
                    raise ValueError(
                        "An async awaitable has been passed to `RateLimiter`. "
                        "Use `AsyncRateLimiter` instead, which supports awaitables."
                    )
                return res
            except self._retry_exceptions as e:
                # Resume the generator inside its `yield i`: it yields
                # True when the retries have been exhausted.
                if gen.throw(e):
                    # That was the final try.
                    return self._handle_exc(args, kwargs)
                self._sleep(self.error_wait_seconds)

        raise RuntimeError("Should not have been reached")  # pragma: no cover


class AsyncRateLimiter(BaseRateLimiter):
    """This is a Rate Limiter implementation for asynchronous functions
    (like geocoders with :class:`geopy.adapters.BaseAsyncAdapter`).

    Examples::

        from geopy.adapters import AioHTTPAdapter
        from geopy.extra.rate_limiter import AsyncRateLimiter
        from geopy.geocoders import Nominatim

        async with Nominatim(
            user_agent="specify_your_app_name_here",
            adapter_factory=AioHTTPAdapter,
        ) as geolocator:

            search = ["moscow", "paris", "berlin", "tokyo", "beijing"]
            geocode = AsyncRateLimiter(geolocator.geocode, min_delay_seconds=1)
            locations = [await geocode(s) for s in search]

            search = [
                (55.47, 37.32), (48.85, 2.35), (52.51, 13.38),
                (34.69, 139.40), (39.90, 116.39)
            ]
            reverse = AsyncRateLimiter(geolocator.reverse, min_delay_seconds=1)
            locations = [await reverse(s) for s in search]

    The AsyncRateLimiter class is safe to use across multiple concurrent
    tasks. If the geocoding service's responses are slower than
    ``min_delay_seconds``, then you can benefit from parallelizing
    the work::

        import asyncio

        from geopy.geocoders import OpenMapQuest

        async with OpenMapQuest(
            api_key="...", adapter_factory=AioHTTPAdapter
        ) as geolocator:

            geocode = AsyncRateLimiter(geolocator.geocode, min_delay_seconds=1/20)
            locations = await asyncio.gather(*(geocode(s) for s in search))

    .. versionadded:: 2.0
    """

    def __init__(
        self,
        func,
        *,
        min_delay_seconds=0.0,
        max_retries=2,
        error_wait_seconds=5.0,
        swallow_exceptions=True,
        return_value_on_exception=None
    ):
        """
        :param callable func:
            A function which should be wrapped by the rate limiter.

        :param float min_delay_seconds:
            Minimum delay in seconds between the wrapped ``func`` calls.
            To convert an :abbr:`RPS (Requests Per Second)` rate to
            ``min_delay_seconds`` you need to divide 1 by RPS. For example,
            if you need to keep the rate at 20 RPS, you can use
            ``min_delay_seconds=1/20``.

        :param int max_retries:
            Number of retries on exceptions. Only
            :class:`geopy.exc.GeocoderServiceError` exceptions are
            retried -- others are always re-raised. At most
            ``max_retries + 1`` requests are performed per query. Set
            ``max_retries=0`` to disable retries.

        :param float error_wait_seconds:
            Time to wait between retries after errors. Must be
            greater than or equal to ``min_delay_seconds``.

        :param bool swallow_exceptions:
            Should an exception be swallowed after retries? If not,
            it will be re-raised. If yes, the ``return_value_on_exception``
            will be returned.

        :param return_value_on_exception:
            Value to return on failure when ``swallow_exceptions=True``.

        """
        super().__init__(
            min_delay_seconds=min_delay_seconds,
            max_retries=max_retries,
            swallow_exceptions=swallow_exceptions,
            return_value_on_exception=return_value_on_exception,
        )
        self.func = func
        self.error_wait_seconds = error_wait_seconds
        assert error_wait_seconds >= min_delay_seconds
        assert max_retries >= 0

    async def _sleep(self, seconds):  # pragma: no cover
        logger.debug(type(self).__name__ + " sleep(%r)", seconds)
        await asyncio.sleep(seconds)

    async def _acquire_request_slot(self):
        for wait in self._acquire_request_slot_gen():
            await self._sleep(wait)

    async def __call__(self, *args, **kwargs):
        gen = self._retries_gen(args, kwargs)
        for _ in gen:
            await self._acquire_request_slot()
            try:
                return await self.func(*args, **kwargs)
            except self._retry_exceptions as e:
                # Resume the generator inside its `yield i`: it yields
                # True when the retries have been exhausted.
                if gen.throw(e):
                    # That was the final try.
                    return self._handle_exc(args, kwargs)
                await self._sleep(self.error_wait_seconds)

        raise RuntimeError("Should not have been reached")  # pragma: no cover