""" Adapters are HTTP client implementations used by geocoders. Some adapters might support keep-alives, request retries, http2, persistence of Cookies, response compression and so on. Adapters should be considered an implementation detail. Most of the time you wouldn't need to know about their existence unless you want to tune HTTP client settings. .. versionadded:: 2.0 Adapters are currently provided on a `provisional basis`_. .. _provisional basis: https://docs.python.org/3/glossary.html#term-provisional-api """ import abc import asyncio import contextlib import email import json import ssl import time import warnings from socket import timeout as SocketTimeout from ssl import SSLError from urllib.error import HTTPError from urllib.parse import urlparse from urllib.request import ( HTTPSHandler, ProxyHandler, Request, URLError, build_opener, getproxies, ) from geopy.exc import ( GeocoderParseError, GeocoderServiceError, GeocoderTimedOut, GeocoderUnavailable, GeopyError, ) from geopy.util import logger try: import requests from requests.adapters import HTTPAdapter as RequestsHTTPAdapter requests_available = True except ImportError: RequestsHTTPAdapter = object requests_available = False try: import aiohttp import aiohttp.client_exceptions import yarl aiohttp_available = True except ImportError: aiohttp_available = False class AdapterHTTPError(IOError): """An exception which must be raised by adapters when an HTTP response with a non-successful status code has been received. Base Geocoder class translates this exception to an instance of :class:`geopy.exc.GeocoderServiceError`. """ def __init__(self, message, *, status_code, headers, text): """ :param str message: Standard exception message. :param int status_code: HTTP status code. :param dict headers: HTTP response readers. A mapping object with lowercased or case-insensitive keys. .. versionadded:: 2.2 :param str text: HTTP body text. """ self.status_code = status_code self.headers = headers self.text = text super().__init__(message) def get_retry_after(headers): """Return Retry-After header value in seconds. .. versionadded:: 2.2 """ # https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/Retry-After # https://github.com/urllib3/urllib3/blob/1.26.4/src/urllib3/util/retry.py#L376 try: retry_after = headers['retry-after'] except KeyError: return None if not retry_after: # None, '' return None retry_after = retry_after.strip() # RFC7231 section-7.1.3: # Retry-After = HTTP-date / delay-seconds try: # Retry-After: 120 seconds = int(retry_after) except ValueError: # Retry-After: Fri, 31 Dec 1999 23:59:59 GMT retry_date_tuple = email.utils.parsedate_tz(retry_after) if retry_date_tuple is None: logger.warning('Invalid Retry-After header: %s', retry_after) return None retry_date = email.utils.mktime_tz(retry_date_tuple) seconds = retry_date - time.time() if seconds < 0: seconds = 0 return seconds class BaseAdapter(abc.ABC): """Base class for an Adapter. There are two types of adapters: - :class:`.BaseSyncAdapter` -- synchronous adapter, - :class:`.BaseAsyncAdapter` -- asynchronous (asyncio) adapter. Concrete adapter implementations must extend one of the two base adapters above. See :attr:`geopy.geocoders.options.default_adapter_factory` for details on how to specify an adapter to be used by geocoders. """ # A class attribute which tells if this Adapter's required dependencies # are installed. By default assume that all Adapters are available. is_available = True def __init__(self, *, proxies, ssl_context): """Initialize adapter. :param dict proxies: An urllib-style proxies dict, e.g. ``{"http": "192.0.2.0:8080", "https": "192.0.2.0:8080"}``, ``{"https": "http://user:passw0rd@192.0.2.0:8080""}``. See :attr:`geopy.geocoders.options.default_proxies` (note that Adapters always receive a dict: the string proxy is transformed to dict in the base :class:`geopy.geocoders.base.Geocoder` class.). :type ssl_context: :class:`ssl.SSLContext` :param ssl_context: See :attr:`geopy.geocoders.options.default_ssl_context`. """ @abc.abstractmethod def get_json(self, url, *, timeout, headers): """Same as ``get_text`` except that the response is expected to be a valid JSON. The value returned is the parsed JSON. :class:`geopy.exc.GeocoderParseError` must be raised if the response cannot be parsed. :param str url: The target URL. :param float timeout: See :attr:`geopy.geocoders.options.default_timeout`. :param dict headers: A dict with custom HTTP request headers. """ @abc.abstractmethod def get_text(self, url, *, timeout, headers): """Make a GET request and return the response as string. This method should not raise any exceptions other than these: - :class:`geopy.adapters.AdapterHTTPError` should be raised if the response was successfully retrieved but the status code was non-successful. - :class:`geopy.exc.GeocoderTimedOut` should be raised when the request times out. - :class:`geopy.exc.GeocoderUnavailable` should be raised when the target host is unreachable. - :class:`geopy.exc.GeocoderServiceError` is the least specific error in the exceptions hierarchy and should be raised in any other cases. :param str url: The target URL. :param float timeout: See :attr:`geopy.geocoders.options.default_timeout`. :param dict headers: A dict with custom HTTP request headers. """ class BaseSyncAdapter(BaseAdapter): """Base class for synchronous adapters. """ def __enter__(self): return self def __exit__(self, exc_type, exc_val, exc_tb): pass class BaseAsyncAdapter(BaseAdapter): """Base class for asynchronous adapters. See also: :ref:`Async Mode <async_mode>`. """ async def __aenter__(self): return self async def __aexit__(self, exc_type, exc_val, exc_tb): pass def _normalize_proxies(proxies): """Normalize user-supplied `proxies`: - For `None` -- retrieve System proxies using :func:`urllib.request.getproxies` - Add `http://` scheme to proxy urls if missing. """ if proxies is None: # Use system proxy settings proxies = getproxies() if not proxies: return {} # Disable proxies normalized = {} for scheme, url in proxies.items(): if url and "://" not in url: # Without the scheme there are errors: # from aiohttp: # ValueError: Only http proxies are supported # from requests (in some envs): # urllib3.exceptions.ProxySchemeUnknown: Not supported # proxy scheme localhost url = "http://%s" % url normalized[scheme] = url return normalized class URLLibAdapter(BaseSyncAdapter): """The fallback adapter which uses urllib from the Python standard library, see :func:`urllib.request.urlopen`. urllib doesn't support keep-alives, request retries, doesn't persist Cookies and is HTTP/1.1 only. urllib was the only available option for making requests in geopy 1.x, so this adapter behaves the same as geopy 1.x in terms of HTTP requests. """ def __init__(self, *, proxies, ssl_context): proxies = _normalize_proxies(proxies) super().__init__(proxies=proxies, ssl_context=ssl_context) # `ProxyHandler` should be present even when actually there're # no proxies. `build_opener` contains it anyway. By specifying # it here explicitly we can disable system proxies (i.e. # from HTTP_PROXY env var) by setting `proxies` to `{}`. # Otherwise, if we didn't specify ProxyHandler for empty # `proxies` here, the `build_opener` would have used one internally # which could have unwillingly picked up the system proxies. opener = build_opener( HTTPSHandler(context=ssl_context), ProxyHandler(proxies), ) self.urlopen = opener.open def get_json(self, url, *, timeout, headers): text = self.get_text(url, timeout=timeout, headers=headers) try: return json.loads(text) except ValueError: raise GeocoderParseError( "Could not deserialize using deserializer:\n%s" % text ) def get_text(self, url, *, timeout, headers): req = Request(url=url, headers=headers) try: page = self.urlopen(req, timeout=timeout) except Exception as error: message = str(error.args[0]) if len(error.args) else str(error) if isinstance(error, HTTPError): code = error.getcode() response_headers = { name.lower(): value for name, value in error.headers.items() } body = self._read_http_error_body(error) raise AdapterHTTPError( message, status_code=code, headers=response_headers, text=body, ) elif isinstance(error, URLError): if "timed out" in message: raise GeocoderTimedOut("Service timed out") elif "unreachable" in message: raise GeocoderUnavailable("Service not available") elif isinstance(error, SocketTimeout): raise GeocoderTimedOut("Service timed out") elif isinstance(error, SSLError): if "timed out" in message: raise GeocoderTimedOut("Service timed out") raise GeocoderServiceError(message) else: text = self._decode_page(page) status_code = page.getcode() if status_code >= 400: response_headers = { name.lower(): value for name, value in page.headers.items() } raise AdapterHTTPError( "Non-successful status code %s" % status_code, status_code=status_code, headers=response_headers, text=text, ) return text def _read_http_error_body(self, error): try: return self._decode_page(error) except Exception: logger.debug( "Unable to fetch body for a non-successful HTTP response", exc_info=True ) return None def _decode_page(self, page): encoding = page.headers.get_content_charset() or "utf-8" try: body_bytes = page.read() except Exception: raise GeocoderServiceError("Unable to read the response") try: return str(body_bytes, encoding=encoding) except ValueError: raise GeocoderParseError("Unable to decode the response bytes") class RequestsAdapter(BaseSyncAdapter): """The adapter which uses `requests`_ library. .. _requests: https://requests.readthedocs.io `requests` supports keep-alives, retries, persists Cookies, allows response compression and uses HTTP/1.1 [currently]. ``requests`` package must be installed in order to use this adapter. The requests' ``trust_env`` value is set to false, meaning that environment doesn't affect the requests' configuration. The ``ssl_context`` and ``proxies`` settings can be used for configuration. .. versionchanged:: 2.4 This adapter used to use the `certifi` CA bundle by default, if an ``ssl_context`` wasn't provided explicitly. This has been changed to use the system CA store by default. """ is_available = requests_available def __init__( self, *, proxies, ssl_context, pool_connections=10, pool_maxsize=10, max_retries=2, pool_block=False ): if not requests_available: raise ImportError( "`requests` must be installed in order to use RequestsAdapter. " "If you have installed geopy via pip, you may use " "this command to install requests: " '`pip install "geopy[requests]"`.' ) proxies = _normalize_proxies(proxies) if ssl_context is None: # By default requests uses CA bundle from `certifi` package. # This is typically overridden with the `REQUESTS_CA_BUNDLE` # environment variable. However, trust_env is disabled # below to turn off the requests-specific logic of proxy # servers configuration, which is re-implemented in geopy # so that it's similar between different Adapters implementations. # # Here, in order to align the adapter's behavior with # the default URLLibAdapter, we explicitly pass an ssl context, # which would be initialized with the system's CA store # rather than the certifi's bundle requests uses by default. # # See also https://github.com/geopy/geopy/issues/546 ssl_context = ssl.create_default_context() super().__init__(proxies=proxies, ssl_context=ssl_context) self.session = requests.Session() self.session.trust_env = False # don't use system proxies self.session.proxies = proxies self.session.mount( "http://", RequestsHTTPAdapter( pool_connections=pool_connections, pool_maxsize=pool_maxsize, max_retries=max_retries, pool_block=pool_block, ), ) self.session.mount( "https://", RequestsHTTPWithSSLContextAdapter( ssl_context=ssl_context, pool_connections=pool_connections, pool_maxsize=pool_maxsize, max_retries=max_retries, pool_block=pool_block, ), ) def __enter__(self): return self def __exit__(self, exc_type, exc_val, exc_tb): self.session.close() def __del__(self): # Cleanup keepalive connections when Geocoder (and, thus, Adapter) # instances are getting garbage-collected. session = getattr(self, "session", None) if session is not None: try: session.close() except TypeError: # It's possible for the close method to try to fetch a # non-existent old_pool in urllib3 with a misleading state # ultimately due to stdlib queue/threading behaviour. # Since the error arises from a non-existent pool # (TypeError: 'NoneType' object is not callable) # it's safe to ignore this error pass def get_text(self, url, *, timeout, headers): resp = self._request(url, timeout=timeout, headers=headers) return resp.text def get_json(self, url, *, timeout, headers): resp = self._request(url, timeout=timeout, headers=headers) try: return resp.json() except ValueError: raise GeocoderParseError( "Could not deserialize using deserializer:\n%s" % resp.text ) def _request(self, url, *, timeout, headers): try: resp = self.session.get(url, timeout=timeout, headers=headers) except Exception as error: message = str(error) if isinstance(error, SocketTimeout): raise GeocoderTimedOut("Service timed out") elif isinstance(error, SSLError): if "timed out" in message: raise GeocoderTimedOut("Service timed out") elif isinstance(error, requests.ConnectionError): if "unauthorized" in message.lower(): raise GeocoderServiceError(message) else: raise GeocoderUnavailable(message) elif isinstance(error, requests.Timeout): raise GeocoderTimedOut("Service timed out") raise GeocoderServiceError(message) else: if resp.status_code >= 400: raise AdapterHTTPError( "Non-successful status code %s" % resp.status_code, status_code=resp.status_code, headers=resp.headers, text=resp.text, ) return resp class AioHTTPAdapter(BaseAsyncAdapter): """The adapter which uses `aiohttp`_ library. .. _aiohttp: https://docs.aiohttp.org/ `aiohttp` supports keep-alives, persists Cookies, allows response compression and uses HTTP/1.1 [currently]. ``aiohttp`` package must be installed in order to use this adapter. """ is_available = aiohttp_available def __init__(self, *, proxies, ssl_context): if not aiohttp_available: raise ImportError( "`aiohttp` must be installed in order to use AioHTTPAdapter. " "If you have installed geopy via pip, you may use " "this command to install aiohttp: " '`pip install "geopy[aiohttp]"`.' ) proxies = _normalize_proxies(proxies) super().__init__(proxies=proxies, ssl_context=ssl_context) self.proxies = proxies self.ssl_context = ssl_context @property def session(self): # Lazy session creation, which allows to avoid "unclosed socket" # warnings if a Geocoder instance is created without entering # async context and making any requests. session = self.__dict__.get("session") if session is None: session = aiohttp.ClientSession( trust_env=False, # don't use system proxies raise_for_status=False ) self.__dict__["session"] = session return session async def __aenter__(self): return self async def __aexit__(self, exc_type, exc_val, exc_tb): # Might issue a warning if loop is immediately closed: # ResourceWarning: unclosed transport <_SelectorSocketTransport fd=10> # https://github.com/aio-libs/aiohttp/issues/1115#issuecomment-242278593 # https://github.com/python/asyncio/issues/466 await self.session.close() async def get_text(self, url, *, timeout, headers): with self._normalize_exceptions(): async with self._request(url, timeout=timeout, headers=headers) as resp: await self._raise_for_status(resp) return await resp.text() async def get_json(self, url, *, timeout, headers): with self._normalize_exceptions(): async with self._request(url, timeout=timeout, headers=headers) as resp: await self._raise_for_status(resp) try: try: return await resp.json() except aiohttp.client_exceptions.ContentTypeError: # `Attempt to decode JSON with unexpected mimetype: # text/plain;charset=utf-8` return json.loads(await resp.text()) except ValueError: raise GeocoderParseError( "Could not deserialize using deserializer:\n%s" % (await resp.text()) ) async def _raise_for_status(self, resp): if resp.status >= 400: raise AdapterHTTPError( "Non-successful status code %s" % resp.status, status_code=resp.status, headers=resp.headers, text=await resp.text(), ) def _request(self, url, *, timeout, headers): if self.proxies: scheme = urlparse(url).scheme proxy = self.proxies.get(scheme.lower()) else: proxy = None # aiohttp accepts url as string or as yarl.URL. # A string url might be re-encoded by yarl, which might cause # a hashsum of params to change. Some geocoders use that # to authenticate their requests (such as Baidu SK). url = yarl.URL(url, encoded=True) # `encoded` param disables url re-encoding return self.session.get( url, timeout=timeout, headers=headers, proxy=proxy, ssl=self.ssl_context ) @contextlib.contextmanager def _normalize_exceptions(self): try: yield except (GeopyError, AdapterHTTPError, AssertionError): raise except Exception as error: message = str(error) if isinstance(error, asyncio.TimeoutError): raise GeocoderTimedOut("Service timed out") elif isinstance(error, SSLError): if "timed out" in message: raise GeocoderTimedOut("Service timed out") elif isinstance(error, aiohttp.ClientConnectionError): raise GeocoderUnavailable(message) raise GeocoderServiceError(message) # https://github.com/kennethreitz/requests/issues/3774#issuecomment-267871876 class RequestsHTTPWithSSLContextAdapter(RequestsHTTPAdapter): def __init__(self, *, ssl_context=None, **kwargs): self.__ssl_context = ssl_context self.__urllib3_warned = False super().__init__(**kwargs) def init_poolmanager(self, *args, **kwargs): if self.__ssl_context is not None: # This ssl context would get passed through the urllib3's # `PoolManager` up to the `HTTPSConnection` class. kwargs["ssl_context"] = self.__ssl_context self.__warn_if_old_urllib3() return super().init_poolmanager(*args, **kwargs) def proxy_manager_for(self, proxy, **proxy_kwargs): if self.__ssl_context is not None: proxy_kwargs["ssl_context"] = self.__ssl_context self.__warn_if_old_urllib3() return super().proxy_manager_for(proxy, **proxy_kwargs) def __warn_if_old_urllib3(self): if self.__urllib3_warned: return self.__urllib3_warned = True try: import requests.packages.urllib3 as urllib3 except ImportError: import urllib3 def silent_int(s): try: return int(s) except ValueError: return 0 version = tuple(silent_int(v) for v in urllib3.__version__.split(".")) if version < (1, 24, 2): warnings.warn( "urllib3 prior to 1.24.2 is known to have a bug with " "custom ssl contexts: it attempts to load system certificates " "to them. Please consider upgrading `requests` and `urllib3` " "packages. See https://github.com/urllib3/urllib3/pull/1566", UserWarning, ) def cert_verify(self, conn, url, verify, cert): super().cert_verify(conn, url, verify, cert) if self.__ssl_context is not None: # Stop requests from adding any certificates to the ssl context. conn.ca_certs = None conn.ca_cert_dir = None conn.cert_file = None conn.key_file = None