""" Translation module with extensible backend support. To add a new translation provider: 1. Create a class that inherits from TranslationBackend 2. Implement the translate() method 3. Register it in the TranslationService class """ from abc import ABC, abstractmethod from typing import Optional import os class TranslationBackend(ABC): """Abstract base class for translation backends.""" @abstractmethod def translate(self, text: str, target_lang: str, source_lang: str = "en") -> Optional[str]: """ Translate text from source language to target language. Args: text: Text to translate target_lang: Target language code (e.g., 'de', 'fr', 'es') source_lang: Source language code (default: 'en') Returns: Translated text, or None if translation fails """ pass class HuggingFaceTranslator(TranslationBackend): """ Translation backend using HuggingFace transformers. Uses Helsinki-NLP models for translation. """ def __init__(self): self._model = None self._tokenizer = None self._model_name = None self._device = 'cpu' # Default device def _load_model(self, target_lang: str): """Lazy load the translation model.""" try: from transformers import MarianMTModel, MarianTokenizer import torch except ImportError as e: raise ImportError( "transformers library not installed. " "Install with: pip install transformers torch" ) from e try: # Check for SentencePiece (required by MarianTokenizer) import sentencepiece except ImportError: raise ImportError( "SentencePiece library not installed. " "Install with: pip install sentencepiece" ) # Map language codes to model names model_map = { 'de': 'Helsinki-NLP/opus-mt-en-de', 'fr': 'Helsinki-NLP/opus-mt-en-fr', 'es': 'Helsinki-NLP/opus-mt-en-es', 'it': 'Helsinki-NLP/opus-mt-en-it', 'pt': 'Helsinki-NLP/opus-mt-en-pt', 'ru': 'Helsinki-NLP/opus-mt-en-ru', } model_name = model_map.get(target_lang) if not model_name: raise ValueError(f"No model available for language: {target_lang}") # Only reload if language changed if self._model_name != model_name: device = 'cuda' if torch.cuda.is_available() else 'cpu' # Load tokenizer first (doesn't need device) self._tokenizer = MarianTokenizer.from_pretrained(model_name) # Load model - try to load directly to device to avoid meta tensor issues try: # For CPU, load normally if device == 'cpu': self._model = MarianMTModel.from_pretrained(model_name) self._model.eval() else: # For CUDA, try loading with device_map or load then move try: # Try loading with device_map if supported self._model = MarianMTModel.from_pretrained( model_name, device_map='auto' ) self._model.eval() # Update device based on where model actually ended up actual_device = next(self._model.parameters()).device.type device = actual_device if actual_device in ['cuda', 'cpu'] else 'cpu' except (TypeError, ValueError): # Fallback: load to CPU first, then move self._model = MarianMTModel.from_pretrained(model_name) self._model.eval() try: self._model = self._model.to(device) except Exception: # If moving fails, keep on CPU device = 'cpu' except Exception as e: # Ultimate fallback: load to CPU print(f"Warning: Error loading model to {device}, using CPU: {e}") self._model = MarianMTModel.from_pretrained(model_name) self._model.eval() device = 'cpu' self._model_name = model_name self._device = device def translate(self, text: str, target_lang: str, source_lang: str = "en") -> Optional[str]: """Translate using HuggingFace model.""" if not text: return "" try: self._load_model(target_lang) import torch # Split text into paragraphs first, then sentences paragraphs = text.split('\n\n') translated_paragraphs = [] for para in paragraphs: if not para.strip(): translated_paragraphs.append(para) continue # Split into sentences (simple approach) sentences = para.split('\n') translated_sentences = [] for sentence in sentences: if not sentence.strip(): translated_sentences.append(sentence) continue try: # Tokenize and move to device inputs = self._tokenizer( [sentence], return_tensors="pt", padding=True, truncation=True, max_length=512 ).to(self._device) # Generate translation with torch.no_grad(): translated = self._model.generate(**inputs, max_length=512) translated_text = self._tokenizer.decode(translated[0], skip_special_tokens=True) translated_sentences.append(translated_text) except Exception as e: print(f"Error translating sentence: {e}") translated_sentences.append(sentence) # Fallback to original translated_paragraphs.append('\n'.join(translated_sentences)) return '\n\n'.join(translated_paragraphs) except Exception as e: import traceback print(f"Translation error: {e}") print(traceback.format_exc()) return None class GoogleTranslateBackend(TranslationBackend): """ Translation backend using Google Translate API. Requires GOOGLE_TRANSLATE_API_KEY environment variable. """ def __init__(self): self.api_key = os.getenv('GOOGLE_TRANSLATE_API_KEY') if not self.api_key: raise ValueError("GOOGLE_TRANSLATE_API_KEY environment variable not set") def translate(self, text: str, target_lang: str, source_lang: str = "en") -> Optional[str]: """Translate using Google Translate API.""" try: from googletrans import Translator translator = Translator() result = translator.translate(text, dest=target_lang, src=source_lang) return result.text except ImportError: raise ImportError("googletrans library not installed. Install with: pip install googletrans==4.0.0rc1") except Exception as e: print(f"Google Translate error: {e}") return None class DeepLTranslator(TranslationBackend): """ Translation backend using DeepL API. Requires DEEPL_API_KEY environment variable. """ def __init__(self): self.api_key = os.getenv('DEEPL_API_KEY') if not self.api_key: raise ValueError("DEEPL_API_KEY environment variable not set") def translate(self, text: str, target_lang: str, source_lang: str = "en") -> Optional[str]: """Translate using DeepL API.""" try: import deepl translator = deepl.Translator(self.api_key) result = translator.translate_text(text, target_lang=target_lang.upper(), source_lang=source_lang.upper()) return result.text except ImportError: raise ImportError("deepl library not installed. Install with: pip install deepl") except Exception as e: print(f"DeepL translation error: {e}") return None class TranslationService: """ Translation service that manages multiple translation backends. Automatically selects the best available backend based on configuration. """ def __init__(self, backend: Optional[str] = None): """ Initialize translation service. Args: backend: Backend name ('huggingface', 'google', 'deepl'). If None, auto-selects based on availability. """ self.backend_name = backend or self._auto_select_backend() self.backend = self._create_backend(self.backend_name) def _auto_select_backend(self) -> str: """Auto-select the best available backend.""" # Priority: DeepL > Google > HuggingFace if os.getenv('DEEPL_API_KEY'): return 'deepl' elif os.getenv('GOOGLE_TRANSLATE_API_KEY'): return 'google' else: return 'huggingface' # Default to local model def _create_backend(self, backend_name: str) -> TranslationBackend: """Create a translation backend instance.""" backends = { 'huggingface': HuggingFaceTranslator, 'google': GoogleTranslateBackend, 'deepl': DeepLTranslator, } backend_class = backends.get(backend_name.lower()) if not backend_class: raise ValueError(f"Unknown backend: {backend_name}") try: return backend_class() except Exception as e: # Fallback to HuggingFace if other backends fail if backend_name != 'huggingface': print(f"Failed to initialize {backend_name}, falling back to HuggingFace: {e}") return HuggingFaceTranslator() raise def translate(self, text: str, target_lang: str, source_lang: str = "en") -> Optional[str]: """Translate text using the configured backend.""" if not text: return "" return self.backend.translate(text, target_lang, source_lang)