This commit is contained in:
2025-11-16 18:01:30 +01:00
commit 858003cb0b
26 changed files with 4712 additions and 0 deletions

242
modules/doc_extractor.py Normal file
View File

@@ -0,0 +1,242 @@
"""
Module for extracting documentation from Python objects using pydoc and inspect.
"""
import pydoc
import inspect
from typing import Optional, Dict, Any
class DocExtractor:
    """
    Extracts documentation from Python objects.

    Supports:
    - Modules
    - Classes
    - Functions
    - Methods
    - Builtins
    - Any object accessible through pydoc
    """

    @staticmethod
    def extract_doc(object_name: str) -> Dict[str, Any]:
        """
        Extract documentation for a Python object.

        Resolving a name imports the module that contains it, which runs that
        module's top-level code; callers passing untrusted names should
        restrict them beforehand.

        Args:
            object_name: Dot-separated path to the object
                (e.g., 'dict.update', 'os.path', 'builtins.BaseException')

        Returns:
            Dictionary containing:
            - 'original': Original English documentation (None on failure)
            - 'object_name': Name of the object, as passed in
            - 'object_type': 'module', 'class', 'function' or 'object'
              (None on failure)
            - 'signature': Signature string for classes, functions and
              methods when inspect can compute one, otherwise None
            - 'error': Error message if extraction failed, otherwise None

        This method does not raise for ordinary failures: any Exception is
        reported through the 'error' key instead.
        """
        try:
            # An empty name would otherwise be resolved by pydoc to the
            # builtins module, which is never what the caller asked for.
            if not isinstance(object_name, str) or not object_name.strip():
                raise ValueError(f"Could not resolve object: {object_name}")

            obj = DocExtractor._resolve_object(object_name)
            if obj is None:
                raise ValueError(f"Could not resolve object: {object_name}")

            docstring = inspect.getdoc(obj) or pydoc.getdoc(obj) or ""
            obj_type = DocExtractor._classify(obj)
            signature = DocExtractor._signature_of(obj)

            # Objects without a docstring: fall back to pydoc's rendered help
            # page, then to the captured output of the interactive helper.
            if not docstring:
                docstring = DocExtractor._rendered_doc(obj, object_name)
            if not docstring:
                docstring = DocExtractor._help_text(obj)

            return {
                'original': docstring,
                'object_name': object_name,
                'object_type': obj_type,
                'signature': signature,
                'error': None
            }
        except Exception as e:
            import traceback
            error_msg = str(e)
            # Don't expose full traceback to user, but log it
            print(f"Error extracting doc for {object_name}: {error_msg}")
            print(traceback.format_exc())
            return {
                'original': None,
                'object_name': object_name,
                'object_type': None,
                'signature': None,
                'error': f"Could not extract documentation: {error_msg}"
            }

    @staticmethod
    def _resolve_object(object_name: str) -> Any:
        """Return the object named by *object_name*, or None if not found."""
        # First attempt: a direct import. For a dotted name the last component
        # is looked up as an attribute of the module named by the rest; passing
        # it in ``fromlist`` also lets it be a submodule (e.g. 'os.path').
        try:
            if '.' in object_name:
                module_name, _, attr_name = object_name.rpartition('.')
                module = __import__(module_name, fromlist=[attr_name])
                found = getattr(module, attr_name)
            else:
                found = __import__(object_name)
        except Exception:
            # Best effort: importing arbitrary modules can fail in arbitrary
            # ways, so any failure just means "try the pydoc fallback".
            found = None
        if found is not None:
            return found

        # Fallback: pydoc also understands bare builtin names ('dict') and
        # attribute chains on classes ('dict.update').
        try:
            # pydoc.resolve returns an (object, name) pair; only the object
            # is wanted. Using the pair itself would document a tuple.
            found, _ = pydoc.resolve(object_name)
        except (ImportError, AttributeError, ValueError):
            return None
        return found

    @staticmethod
    def _classify(obj: Any) -> str:
        """Return the coarse kind label reported as 'object_type'."""
        if inspect.ismodule(obj):
            return "module"
        if inspect.isclass(obj):
            return "class"
        if inspect.isfunction(obj) or inspect.ismethod(obj):
            return "function"
        return "object"

    @staticmethod
    def _signature_of(obj: Any) -> Optional[str]:
        """Return the signature of a class, function or method, else None."""
        if not (inspect.isclass(obj) or inspect.isfunction(obj) or inspect.ismethod(obj)):
            return None
        try:
            return str(inspect.signature(obj))
        except (ValueError, TypeError):
            # inspect cannot produce a signature for this callable.
            return None

    @staticmethod
    def _rendered_doc(obj: Any, object_name: str) -> str:
        """Return pydoc's plain-text help page for *obj*, or '' on failure."""
        try:
            help_text = pydoc.render_doc(obj, renderer=pydoc.plaintext)
        except Exception:
            return ""
        lines = help_text.split('\n')
        leaf = object_name.rpartition('.')[2]
        # Start at the first non-blank line that does not begin with the
        # object's own name; if there is none, keep the whole text.
        start_idx = next(
            (
                i for i, line in enumerate(lines)
                if line.strip() and not line.strip().startswith(leaf)
            ),
            0,
        )
        return '\n'.join(lines[start_idx:]).strip()

    @staticmethod
    def _help_text(obj: Any) -> str:
        """Return up to 500 characters of interactive-help output for *obj*."""
        import contextlib
        import io

        buffer = io.StringIO()
        try:
            # redirect_stdout restores the previously active stream even when
            # the helper raises, so stdout is never left pointing at the buffer.
            with contextlib.redirect_stdout(buffer):
                pydoc.help(obj)
        except Exception:
            return ""
        kept = [
            line for line in buffer.getvalue().split('\n')
            if line.strip() and not line.strip().startswith('Help on')
        ]
        return '\n'.join(kept)[:500]