initial
This commit is contained in:
242
modules/doc_extractor.py
Normal file
242
modules/doc_extractor.py
Normal file
@@ -0,0 +1,242 @@
|
||||
"""
|
||||
Module for extracting documentation from Python objects using pydoc and inspect.
|
||||
"""
|
||||
import pydoc
|
||||
import inspect
|
||||
from typing import Optional, Dict, Any
|
||||
|
||||
|
||||
class DocExtractor:
    """
    Extracts documentation from Python objects.

    Supports:
    - Modules
    - Classes
    - Functions
    - Methods
    - Builtins
    - Any object accessible through pydoc
    """

    @staticmethod
    def extract_doc(object_name: str) -> Dict[str, Any]:
        """
        Extract documentation for a Python object.

        Never raises for ordinary failures: any exception is caught, printed
        together with its traceback, and reported through the 'error' key.

        Args:
            object_name: Dot-separated path to the object
                (e.g., 'dict.update', 'os.path', 'builtins.BaseException')

        Returns:
            Dictionary containing:
            - 'original': Original English documentation (None on failure)
            - 'object_name': Name of the object, as passed in
            - 'object_type': 'module', 'class', 'function' or 'object'
              (None on failure)
            - 'signature': Function/method/class signature string, or None
              when not applicable or not introspectable
            - 'error': Error message if extraction failed, otherwise None
        """
        try:
            obj = DocExtractor._resolve_object(object_name)

            docstring = inspect.getdoc(obj) or pydoc.getdoc(obj) or ""
            if not docstring:
                # No docstring at all: fall back to rendered help text.
                docstring = DocExtractor._fallback_help_text(obj, object_name)

            return {
                'original': docstring,
                'object_name': object_name,
                'object_type': DocExtractor._classify(obj),
                'signature': DocExtractor._get_signature(obj),
                'error': None
            }

        except Exception as e:
            import traceback
            error_msg = str(e)
            # Don't expose full traceback to user, but log it
            print(f"Error extracting doc for {object_name}: {error_msg}")
            print(traceback.format_exc())

            return {
                'original': None,
                'object_name': object_name,
                'object_type': None,
                'signature': None,
                'error': f"Could not extract documentation: {error_msg}"
            }

    @staticmethod
    def _resolve_object(object_name: str) -> Any:
        """
        Turn a dotted path into the object it names.

        Tries a direct import first (module for a bare name, module attribute
        for a dotted name) and falls back to pydoc for everything else, such
        as bare builtins ('dict') or attributes of classes ('dict.update').

        Raises:
            ValueError: if neither strategy yields an object.
        """
        module_name, dot, attr_name = object_name.rpartition('.')

        obj = None
        try:
            if dot:
                # Dotted name (e.g., 'os.path', 'builtins.len', 'json.dumps')
                module = __import__(module_name, fromlist=[attr_name])
                obj = getattr(module, attr_name)
            else:
                # Simple module name (e.g., 'asyncio')
                module = __import__(object_name)
                if inspect.ismodule(module):
                    obj = module
        except Exception:
            # Best effort: importing can fail in arbitrary ways (missing
            # module, missing attribute, errors raised by the module's own
            # top-level code). The pydoc fallback below gets its chance.
            obj = None

        if obj is None:
            try:
                # pydoc.resolve returns an (object, name) pair; only the
                # object is wanted. Using the pair itself as the object makes
                # every lookup report the documentation of `tuple`.
                obj, _ = pydoc.resolve(object_name)
            except (ImportError, AttributeError, ValueError):
                obj = None

        if obj is None:
            raise ValueError(f"Could not resolve object: {object_name}")
        return obj

    @staticmethod
    def _classify(obj: Any) -> str:
        """Return 'module', 'class', 'function' or 'object' for obj."""
        if inspect.ismodule(obj):
            return "module"
        if inspect.isclass(obj):
            return "class"
        # isroutine also covers builtin functions and method descriptors
        # (e.g. len, dict.update), not just Python-level functions/methods.
        if inspect.isroutine(obj):
            return "function"
        return "object"

    @staticmethod
    def _get_signature(obj: Any) -> Optional[str]:
        """Return the signature of a class or routine as text, else None."""
        if not (inspect.isclass(obj) or inspect.isroutine(obj)):
            return None
        try:
            return str(inspect.signature(obj))
        except (ValueError, TypeError):
            # Many C-implemented callables expose no introspectable signature.
            return None

    @staticmethod
    def _fallback_help_text(obj: Any, object_name: str) -> str:
        """
        Build documentation text for an object that has no docstring.

        First renders the pydoc page as plain text; if that fails or is
        empty, captures the output of the interactive help system instead
        (truncated to 500 characters). Returns "" if both attempts fail.
        """
        short_name = object_name.split('.')[-1]

        try:
            lines = pydoc.render_doc(obj, renderer=pydoc.plaintext).split('\n')
            # Start at the first non-empty line that is not just the
            # object's own name.
            start_idx = 0
            for i, line in enumerate(lines):
                stripped = line.strip()
                if stripped and not stripped.startswith(short_name):
                    start_idx = i
                    break
            text = '\n'.join(lines[start_idx:]).strip()
            if text:
                return text
        except Exception:
            pass  # Best effort; try the help() capture below

        try:
            import contextlib
            import io

            buffer = io.StringIO()
            # redirect_stdout restores whatever stdout was active before,
            # even if help raises, instead of forcing sys.__stdout__.
            with contextlib.redirect_stdout(buffer):
                # pydoc.help is the object the builtin help() delegates to.
                pydoc.help(obj)
            meaningful = [
                line for line in buffer.getvalue().split('\n')
                if line.strip() and not line.strip().startswith('Help on')
            ]
            return '\n'.join(meaningful)[:500]
        except Exception:
            return ""
|
||||
|
||||
Reference in New Issue
Block a user