initial

2025-11-16 18:01:30 +01:00
commit 858003cb0b
26 changed files with 4712 additions and 0 deletions
--- a/modules/doc_extractor.py
+++ b/modules/doc_extractor.py
@@ -0,0 +1,242 @@
+"""
+Module for extracting documentation from Python objects using pydoc and inspect.
+"""
+import pydoc
+import inspect
+from typing import Optional, Dict, Any
+
+
+class DocExtractor:
+    """
+    Extracts documentation from Python objects.
+
+    Supports:
+    - Modules
+    - Classes
+    - Functions
+    - Methods
+    - Builtins
+    - Any object accessible through pydoc
+    """
+
+    @staticmethod
+    def _resolve(object_name: str) -> Any:
+        """
+        Resolve a dot-separated name to the object it refers to.
+
+        Strategies are tried in order; the first one that yields an object wins:
+
+        1. Attribute lookup on the ``builtins`` module for 'builtins.<name>'.
+        2. Direct import of the module, or of the parent module followed by
+           ``getattr`` for the last component.
+        3. ``pydoc.resolve`` as a last resort (handles bare builtin names such
+           as 'len' and attribute chains on classes such as 'dict.update').
+
+        Args:
+            object_name: Dot-separated path to the object.
+
+        Returns:
+            The resolved object, or None if no strategy succeeded.
+
+        Raises:
+            Exception: ``pydoc.resolve`` may raise errors other than
+                ImportError/AttributeError/ValueError (e.g. a failure while
+                importing a module); those propagate to the caller.
+        """
+        obj = None
+        builtins_prefix = 'builtins.'
+
+        # 1. Builtins: look the attribute up directly.
+        if object_name.startswith(builtins_prefix):
+            import builtins
+            name = object_name[len(builtins_prefix):]
+            candidate = getattr(builtins, name, None)
+            # Accept the candidate only when it reports the requested name;
+            # anything else is left to the strategies below.
+            if getattr(candidate, '__name__', None) == name:
+                obj = candidate
+
+        # 2. Direct import of the module or of the parent module.
+        if obj is None:
+            try:
+                parts = object_name.split('.')
+                if len(parts) == 1:
+                    # Simple module name (e.g., 'asyncio')
+                    module = __import__(object_name)
+                    obj = module if inspect.ismodule(module) else None
+                else:
+                    # Dotted name (e.g., 'os.path', 'collections.abc')
+                    module = __import__('.'.join(parts[:-1]), fromlist=[parts[-1]])
+                    obj = getattr(module, parts[-1])
+            except Exception:
+                # Best effort: importing arbitrary names can fail in many
+                # ways, and the pydoc fallback below still gets a chance.
+                obj = None
+
+        # 3. pydoc fallback.
+        if obj is None:
+            try:
+                # pydoc.resolve returns an (object, name) tuple. Using the
+                # tuple itself as the object would document the tuple type
+                # instead of the requested object, so unpack it.
+                obj, _ = pydoc.resolve(object_name)
+            except (ImportError, AttributeError, ValueError):
+                obj = None
+
+        return obj
+
+    @staticmethod
+    def extract_doc(object_name: str) -> Dict[str, Any]:
+        """
+        Extract documentation for a Python object.
+
+        Args:
+            object_name: Dot-separated path to the object (e.g., 'dict.update', 'os.path', 'builtins.BaseException')
+
+        Returns:
+            Dictionary containing:
+                - 'original': Original English documentation (None on failure)
+                - 'object_name': Name of the object, as passed in
+                - 'object_type': 'module', 'class', 'function' or 'object' (None on failure)
+                - 'signature': Signature string for classes and routines when
+                  it can be introspected, otherwise None
+                - 'error': Error message if extraction failed, otherwise None
+
+        This method does not raise: any exception is printed together with
+        its traceback and reported through the 'error' key.
+        """
+        try:
+            obj = DocExtractor._resolve(object_name)
+            if obj is None:
+                raise ValueError(f"Could not resolve object: {object_name}")
+
+            # Get the docstring
+            docstring = inspect.getdoc(obj) or pydoc.getdoc(obj) or ""
+
+            # isroutine() also covers builtin functions and method descriptors
+            # (e.g. len, dict.update), not only pure-Python functions/methods.
+            is_routine = inspect.isroutine(obj)
+
+            # Determine object type
+            if inspect.ismodule(obj):
+                obj_type = "module"
+            elif inspect.isclass(obj):
+                obj_type = "class"
+            elif is_routine:
+                obj_type = "function"
+            else:
+                obj_type = "object"
+
+            # Get signature if it's a callable
+            signature = None
+            if inspect.isclass(obj) or is_routine:
+                try:
+                    signature = str(inspect.signature(obj))
+                except (ValueError, TypeError):
+                    # Not every callable exposes an introspectable signature.
+                    pass
+
+            # If docstring is empty, try to get help text
+            if not docstring:
+                try:
+                    help_text = pydoc.render_doc(obj, renderer=pydoc.plaintext)
+                    lines = help_text.split('\n')
+                    short_name = object_name.split('.')[-1]
+                    # Start at the first non-empty line that does not begin
+                    # with the object's own name; default to the very top.
+                    start_idx = 0
+                    for i, line in enumerate(lines):
+                        stripped = line.strip()
+                        if stripped and not stripped.startswith(short_name):
+                            start_idx = i
+                            break
+                    docstring = '\n'.join(lines[start_idx:]).strip()
+                except Exception:
+                    pass  # Best effort; the help() fallback below may still work
+
+            # Final fallback: use help() output
+            if not docstring:
+                try:
+                    import contextlib
+                    import io
+                    help_output = io.StringIO()
+                    # redirect_stdout restores whatever stdout was active
+                    # before, even if help() raises.
+                    with contextlib.redirect_stdout(help_output):
+                        help(obj)
+                    # Keep the meaningful lines, capped at 500 characters
+                    lines = help_output.getvalue().split('\n')
+                    docstring = '\n'.join(
+                        [line for line in lines if line.strip() and not line.strip().startswith('Help on')]
+                    )[:500]
+                except Exception:
+                    pass  # Best effort; an empty docstring is returned as-is
+
+            return {
+                'original': docstring,
+                'object_name': object_name,
+                'object_type': obj_type,
+                'signature': signature,
+                'error': None
+            }
+
+        except Exception as e:
+            import traceback
+            error_msg = str(e)
+            # Don't expose full traceback to user, but log it
+            print(f"Error extracting doc for {object_name}: {error_msg}")
+            print(traceback.format_exc())
+
+            return {
+                'original': None,
+                'object_name': object_name,
+                'object_type': None,
+                'signature': None,
+                'error': f"Could not extract documentation: {error_msg}"
+            }
+