372 lines
14 KiB
Python
372 lines
14 KiB
Python
|
"""Parsing/inferring signatures from documentation.
|
||
|
|
||
|
This module provides several functions to generate better stubs using
docstrings and Sphinx docs (.rst files).
|
||
|
"""
|
||
|
import re
|
||
|
import io
|
||
|
import contextlib
|
||
|
import tokenize
|
||
|
|
||
|
from typing import (
|
||
|
Optional, MutableMapping, MutableSequence, List, Sequence, Tuple, NamedTuple, Any
|
||
|
)
|
||
|
from typing_extensions import Final
|
||
|
|
||
|
# Type alias for signature strings in the format ('func_name', '(arg, opt_arg=False)').
Sig = Tuple[str, str]
|
||
|
|
||
|
|
||
|
# Shape of a plausible type annotation: one or more dot-separated parts, each
# starting with a letter or underscore and continuing with word characters,
# square brackets, commas or spaces.
_TYPE_RE: Final = re.compile(r"^[a-zA-Z_][\w\[\], ]*(\.[a-zA-Z_][\w\[\], ]*)*$")
# An identifier, optionally preceded by any number of '*' characters.
_ARG_NAME_RE: Final = re.compile(r"\**[A-Za-z_][A-Za-z0-9_]*$")
|
||
|
|
||
|
|
||
|
def is_valid_type(s: str) -> bool:
    """Guess whether the string *s* could be a valid type annotation.

    Returns False for the words 'True', 'False' and 'retval', for strings
    that contain a comma but no '[', and for anything that does not match
    _TYPE_RE.
    """
    rejected_words = ('True', 'False', 'retval')
    if s in rejected_words:
        return False
    # A comma is accepted only when the string also contains a '['.
    comma_without_bracket = ',' in s and '[' not in s
    return not comma_without_bracket and _TYPE_RE.match(s) is not None
|
||
|
|
||
|
|
||
|
class ArgSig:
    """Describe a single argument of a function signature."""

    def __init__(self, name: str, type: Optional[str] = None, default: bool = False):
        """Store the argument's name, optional type string and default flag.

        Raises ValueError when a non-empty *type* is rejected by is_valid_type().
        """
        self.name = name
        type_is_given = bool(type)
        if type_is_given and not is_valid_type(type):
            raise ValueError("Invalid type: " + type)
        self.type = type
        # True when the argument has a default value.
        self.default = default

    def __repr__(self) -> str:
        return f"ArgSig(name={self.name!r}, type={self.type!r}, default={self.default!r})"

    def __eq__(self, other: Any) -> bool:
        # Anything that is not an ArgSig never compares equal.
        if not isinstance(other, ArgSig):
            return False
        mine = (self.name, self.type, self.default)
        theirs = (other.name, other.type, other.default)
        return mine == theirs
|
||
|
return False
|
||
|
|
||
|
|
||
|
class FunctionSig(NamedTuple):
    """One function signature: its name, its arguments and its return type."""

    # Name of the function the signature belongs to.
    name: str
    # Arguments of the signature, one ArgSig per argument.
    args: List[ArgSig]
    # Return type as a string (DocStringParser records 'Any' when none is given).
    ret_type: str
|
||
|
|
||
|
|
||
|
# States of the docstring parser. DocStringParser keeps a stack of these values
# and dispatches on the topmost one.
STATE_INIT: Final = 1  # Not inside a signature.
STATE_FUNCTION_NAME: Final = 2  # Just saw the function name; a '(' must follow.
STATE_ARGUMENT_LIST: Final = 3  # Inside the argument list of a signature.
STATE_ARGUMENT_TYPE: Final = 4  # After the ':' of an argument, collecting its type.
STATE_ARGUMENT_DEFAULT: Final = 5  # After the '=' of an argument, collecting its default.
STATE_RETURN_VALUE: Final = 6  # After '->', collecting the return type.
STATE_OPEN_BRACKET: Final = 7  # Inside a nested '[', '(' or '{'. For generic types.
|
||
|
|
||
|
|
||
|
class DocStringParser:
    """Parse function signatures in documentation.

    Tokens produced by the tokenize module are fed one at a time through
    add_token(). The parser keeps a stack of STATE_* values and collects the
    text of the current argument name, type, default or return type in
    ``accumulator``. Each completed signature of the requested function is
    appended to ``signatures``.
    """

    def __init__(self, function_name: str) -> None:
        # Only search for signatures of function with this name.
        self.function_name = function_name
        # Stack of parser states; the last element is the current state.
        self.state = [STATE_INIT]
        # Text collected since the last delimiter.
        self.accumulator = ""
        self.arg_type: Optional[str] = None
        self.arg_name = ""
        self.arg_default: Optional[str] = None
        self.ret_type = "Any"
        # True once "<function_name>(" has been seen on the current logical line.
        self.found = False
        self.args: List[ArgSig] = []
        # Valid signatures found so far.
        self.signatures: List[FunctionSig] = []

    def add_token(self, token: tokenize.TokenInfo) -> None:
        """Process next token from the token stream."""
        if (token.type == tokenize.NAME and token.string == self.function_name and
                self.state[-1] == STATE_INIT):
            self.state.append(STATE_FUNCTION_NAME)

        elif (token.type == tokenize.OP and token.string == '(' and
              self.state[-1] == STATE_FUNCTION_NAME):
            # "<function_name>(" starts a signature.
            self.state.pop()
            self.accumulator = ""
            self.found = True
            self.state.append(STATE_ARGUMENT_LIST)

        elif self.state[-1] == STATE_FUNCTION_NAME:
            # Reset state, function name not followed by '('.
            self.state.pop()

        elif (token.type == tokenize.OP and token.string in ('[', '(', '{') and
              self.state[-1] != STATE_INIT):
            # Nested bracket: keep its text and ignore delimiters until it closes.
            self.accumulator += token.string
            self.state.append(STATE_OPEN_BRACKET)

        elif (token.type == tokenize.OP and token.string in (']', ')', '}') and
              self.state[-1] == STATE_OPEN_BRACKET):
            self.accumulator += token.string
            self.state.pop()

        elif (token.type == tokenize.OP and token.string == ':' and
              self.state[-1] == STATE_ARGUMENT_LIST):
            # "name:" -- what was collected is the argument name; a type follows.
            self.arg_name = self.accumulator
            self.accumulator = ""
            self.state.append(STATE_ARGUMENT_TYPE)

        elif (token.type == tokenize.OP and token.string == '=' and
              self.state[-1] in (STATE_ARGUMENT_LIST, STATE_ARGUMENT_TYPE)):
            # '=' ends either the type ("name: type = ...") or the name ("name=...").
            if self.state[-1] == STATE_ARGUMENT_TYPE:
                self.arg_type = self.accumulator
                self.state.pop()
            else:
                self.arg_name = self.accumulator
            self.accumulator = ""
            self.state.append(STATE_ARGUMENT_DEFAULT)

        elif (token.type == tokenize.OP and token.string in (',', ')') and
              self.state[-1] in (STATE_ARGUMENT_LIST, STATE_ARGUMENT_DEFAULT,
                                 STATE_ARGUMENT_TYPE)):
            # End of one argument (',') or of the whole argument list (')').
            if self.state[-1] == STATE_ARGUMENT_DEFAULT:
                self.arg_default = self.accumulator
                self.state.pop()
            elif self.state[-1] == STATE_ARGUMENT_TYPE:
                self.arg_type = self.accumulator
                self.state.pop()
            elif self.state[-1] == STATE_ARGUMENT_LIST:
                self.arg_name = self.accumulator
                if not (token.string == ')' and self.accumulator.strip() == '') \
                        and not _ARG_NAME_RE.match(self.arg_name):
                    # Invalid argument name.
                    self.reset()
                    return

            if token.string == ')':
                # Leave the argument list.
                self.state.pop()

            # arg_name is empty when there are no args. e.g. func()
            if self.arg_name:
                try:
                    self.args.append(ArgSig(name=self.arg_name, type=self.arg_type,
                                            default=bool(self.arg_default)))
                except ValueError:
                    # wrong type, use Any
                    self.args.append(ArgSig(name=self.arg_name, type=None,
                                            default=bool(self.arg_default)))
            self.arg_name = ""
            self.arg_type = None
            self.arg_default = None
            self.accumulator = ""

        elif token.type == tokenize.OP and token.string == '->' and self.state[-1] == STATE_INIT:
            self.accumulator = ""
            self.state.append(STATE_RETURN_VALUE)

        # ENDMARKER is necessary for python 3.4 and 3.5.
        elif (token.type in (tokenize.NEWLINE, tokenize.ENDMARKER) and
              self.state[-1] in (STATE_INIT, STATE_RETURN_VALUE)):
            if self.state[-1] == STATE_RETURN_VALUE:
                if not is_valid_type(self.accumulator):
                    self.reset()
                    return
                self.ret_type = self.accumulator
                self.accumulator = ""
                self.state.pop()

            if self.found:
                self.signatures.append(FunctionSig(name=self.function_name, args=self.args,
                                                   ret_type=self.ret_type))
                self.found = False
            # Always start the next logical line from a clean slate. A '->' on a
            # line without a matching signature also sets ret_type, and it must
            # not leak into a signature found on a later line.
            self.args = []
            self.ret_type = 'Any'
            # Leave state as INIT.
        else:
            self.accumulator += token.string

    def reset(self) -> None:
        """Abandon the signature currently being parsed and return to the initial state."""
        self.state = [STATE_INIT]
        self.args = []
        self.found = False
        self.accumulator = ""

    def get_signatures(self) -> List[FunctionSig]:
        """Return sorted copy of the list of signatures found so far."""
        def has_arg(name: str, signature: FunctionSig) -> bool:
            return any(x.name == name for x in signature.args)

        def args_kwargs(signature: FunctionSig) -> bool:
            return has_arg('*args', signature) and has_arg('**kwargs', signature)

        # Move functions with (*args, **kwargs) in their signature to last place.
        # sorted() is stable, so the relative order is otherwise preserved.
        return sorted(self.signatures, key=lambda x: 1 if args_kwargs(x) else 0)
|
||
|
|
||
|
|
||
|
def infer_sig_from_docstring(docstr: Optional[str], name: str) -> Optional[List[FunctionSig]]:
    """Extract the signatures of function *name* from a docstring.

    A signature is text of the form <function_name>(<signature>) -> <return type>,
    where the return type part may be missing.

    The result is an empty list when no signature is found, a single signature
    in the typical case, and several signatures when the docstring lists
    overloads. Signatures with repeated argument names are dropped.

    Return None if the docstring is empty, or if tokenizing it raises
    IndentationError.

    Arguments:
        * docstr: docstring
        * name: name of function for which signatures are to be found
    """
    if not docstr:
        return None

    parser = DocStringParser(name)
    try:
        source = io.BytesIO(docstr.encode('utf-8'))
        for tok in tokenize.tokenize(source.readline):
            parser.add_token(tok)
    except IndentationError:
        return None
    except tokenize.TokenError:
        # Return all found signatures, even if there is a parse error after some are found.
        pass

    def has_distinct_arg_names(sig: FunctionSig) -> bool:
        """Return True if no argument name occurs twice in *sig*."""
        names = [arg.name for arg in sig.args]
        return len(names) == len(set(names))

    # Return only signatures that have unique argument names. Mypy fails on non-unique arg names.
    return [sig for sig in parser.get_signatures() if has_distinct_arg_names(sig)]
|
||
|
|
||
|
|
||
|
def infer_arg_sig_from_anon_docstring(docstr: str) -> List[ArgSig]:
    """Convert signature in form of "(self: TestClass, arg0: str='ada')" to a list of ArgSig.

    The anonymous signature is given the placeholder name "stub" so that
    infer_sig_from_docstring() can find it. Return an empty list when no
    signature is found.
    """
    # Strip surrounding whitespace first: a leading newline would separate
    # "stub" from the opening parenthesis and the signature would be missed.
    ret = infer_sig_from_docstring("stub" + docstr.strip(), "stub")
    if ret:
        return ret[0].args
    return []
|
||
|
|
||
|
|
||
|
def infer_ret_type_sig_from_docstring(docstr: str, name: str) -> Optional[str]:
    """Return the return type of a signature such as "func(self: TestClass, arg0) -> int".

    Only the first signature found for *name* is used. Return None when
    infer_sig_from_docstring() finds nothing.
    """
    found = infer_sig_from_docstring(docstr, name)
    return found[0].ret_type if found else None
|
||
|
|
||
|
|
||
|
def infer_ret_type_sig_from_anon_docstring(docstr: str) -> Optional[str]:
    """Return the return type of an anonymous signature such as "(self: TestClass, arg0) -> int"."""
    # Give the anonymous signature a placeholder name so it can be looked up.
    named_signature = "stub" + docstr.strip()
    return infer_ret_type_sig_from_docstring(named_signature, "stub")
|
||
|
|
||
|
|
||
|
def parse_signature(sig: str) -> Optional[Tuple[str,
                                                List[str],
                                                List[str]]]:
    """Split function signature into its name, positional and optional arguments.

    The expected format is "func_name(arg, opt_arg=False)". Optional arguments
    are accepted in both formats, "x=None" and "[x]". Once one optional
    argument has been seen, every following argument is treated as optional.

    Return the name of function (without any dotted prefix) and lists of
    positional and optional argument names, or None if *sig* does not look
    like a signature. Argument names are returned without surrounding
    whitespace, and empty entries (e.g. from a trailing comma) are skipped.
    """
    m = re.match(r'([.a-zA-Z0-9_]+)\(([^)]*)\)', sig)
    if not m:
        return None
    # Keep only the last component of a dotted name such as "Class.method".
    name = m.group(1).split('.')[-1]
    arg_string = m.group(2)
    if not arg_string.strip():
        # Simple case -- no arguments.
        return name, [], []

    positional: List[str] = []
    optional: List[str] = []
    in_optional = False
    for raw_arg in arg_string.split(','):
        arg = raw_arg.strip()
        # Accept optional arguments in both formats: x=None and [x].
        if not in_optional and (arg.startswith('[') or '=' in arg):
            in_optional = True
        if in_optional:
            # Drop the brackets and the default value; strip again because
            # "x = 1" leaves a trailing space after the split.
            arg_name = arg.strip('[]').split('=')[0].strip()
            if arg_name:
                optional.append(arg_name)
        else:
            # "a [" leaves a trailing space once the bracket is removed.
            arg_name = arg.rstrip('[').strip()
            if arg_name:
                positional.append(arg_name)
            if arg.endswith('['):
                # A trailing '[' opens the optional part: "arg[, opt]".
                in_optional = True
    return name, positional, optional
|
||
|
|
||
|
|
||
|
def build_signature(positional: Sequence[str],
                    optional: Sequence[str]) -> str:
    """Render a signature string from positional and optional argument names.

    Optional names starting with '*' are emitted unchanged; every other
    optional name gets a '=...' default.
    """
    parts = list(positional)
    parts += [opt if opt.startswith('*') else f'{opt}=...' for opt in optional]
    rendered = '(' + ', '.join(parts) + ')'
    # Ad-hoc fixes.
    return rendered.replace('(self)', '')
|
||
|
|
||
|
|
||
|
def parse_all_signatures(lines: Sequence[str]) -> Tuple[List[Sig],
                                                        List[Sig]]:
    """Parse all signatures in a given reST document.

    Lines of the form ".. function:: ...", ".. method:: ..." and
    ".. class:: ..." are examined; lines whose signature cannot be parsed by
    parse_signature() are skipped.

    Return sorted lists of found signatures for functions (including methods)
    and classes.
    """
    sigs: List[Sig] = []
    class_sigs: List[Sig] = []
    for line in lines:
        line = line.strip()
        m = re.match(r'\.\. *(function|method|class) *:: *[a-zA-Z_]', line)
        if not m:
            continue
        # Everything after the first '::' is the signature. Splitting only once
        # keeps signatures intact when they contain '::' themselves.
        sig = line.split('::', 1)[1].strip()
        parsed = parse_signature(sig)
        if not parsed:
            continue
        name, fixed, optional = parsed
        entry = (name, build_signature(fixed, optional))
        if m.group(1) != 'class':
            sigs.append(entry)
        else:
            class_sigs.append(entry)

    return sorted(sigs), sorted(class_sigs)
|
||
|
|
||
|
|
||
|
def find_unique_signatures(sigs: Sequence[Sig]) -> List[Sig]:
    """Keep only the names for which exactly one distinct signature was found.

    Return the surviving (name, signature) pairs in sorted order.
    """
    grouped: MutableMapping[str, List[str]] = {}
    for name, sig in sigs:
        if name not in grouped:
            grouped[name] = []
        grouped[name].append(sig)

    return sorted(
        (name, candidates[0])
        for name, candidates in grouped.items()
        if len(set(candidates)) == 1
    )
|
||
|
|
||
|
|
||
|
def infer_prop_type_from_docstring(docstr: Optional[str]) -> Optional[str]:
    """Check for Google/Numpy style docstring type annotation for a property.

    The docstring has the format "<type>: <descriptions>".
    In the type string, we allow the following characters:
    * dot: because sometimes classes are annotated using full path
    * brackets: to allow type hints like List[int]
    * comma/space: things like Tuple[int, int]

    Return the type string, or None when the docstring is empty or does not
    start with a non-empty "<type>: " prefix.
    """
    if not docstr:
        return None
    # Require at least one character before the colon, so that a docstring
    # starting with ": " does not yield an empty type string.
    test_str = r'^([a-zA-Z0-9_, \.\[\]]+): '
    m = re.match(test_str, docstr)
    return m.group(1) if m else None
|