# Building a Smart Python-to-R Code Converter with Gemini AI-Powered Validation and Feedback

class EnhancedPythonToRConverter:
    """Heuristic Python-to-R source translator with optional Gemini review.

    The conversion is purely textual: import statements are swapped for
    ``library(...)`` calls, known call names are substituted through a lookup
    table, and a handful of regular expressions rewrite common pandas,
    matplotlib and core-syntax idioms.  The result is a starting point for a
    human (or the Gemini validator), not a guaranteed-correct R program.

    Attributes:
        validator: ``GeminiValidator`` instance (defined outside this class).
            This class reads its ``api_key`` attribute and calls its
            ``validate_conversion(python_code, r_code)`` method.
        import_mappings: Python module name -> R ``library(...)`` line(s).
        function_mappings: Python dotted call name -> R function name.
        syntax_patterns: ``(regex, replacement)`` pairs applied in order.
    """

    def __init__(self, gemini_api_key: str = None):
        """Create the converter and its lookup tables.

        Args:
            gemini_api_key: Key forwarded to ``GeminiValidator``; ``None``
                is passed through unchanged.
        """
        self.validator = GeminiValidator(gemini_api_key)

        self.import_mappings = {
            'pandas': 'library(dplyr)\nlibrary(tidyr)\nlibrary(readr)',
            'numpy': 'library(base)',
            'matplotlib.pyplot': 'library(ggplot2)',
            'seaborn': 'library(ggplot2)\nlibrary(RColorBrewer)',
            'scipy.stats': 'library(stats)',
            'sklearn': 'library(caret)\nlibrary(randomForest)\nlibrary(e1071)',
            'statsmodels': 'library(stats)\nlibrary(lmtest)',
            'plotly': 'library(plotly)',
        }

        self.function_mappings = {
            # pandas
            'pd.DataFrame': 'data.frame',
            'pd.read_csv': 'read.csv',
            'pd.read_excel': 'read_excel',
            'df.head': 'head',
            'df.tail': 'tail',
            'df.shape': 'dim',
            'df.info': 'str',
            'df.describe': 'summary',
            'df.mean': 'mean',
            'df.median': 'median',
            'df.std': 'sd',
            'df.var': 'var',
            'df.sum': 'sum',
            'df.count': 'length',
            'df.groupby': 'group_by',
            'df.merge': 'merge',
            'df.drop': 'select',
            'df.dropna': 'na.omit',
            'df.fillna': 'replace_na',
            'df.sort_values': 'arrange',
            'df.value_counts': 'table',

            # numpy
            'np.array': 'c',
            'np.mean': 'mean',
            'np.median': 'median',
            'np.std': 'sd',
            'np.var': 'var',
            'np.sum': 'sum',
            'np.min': 'min',
            'np.max': 'max',
            'np.sqrt': 'sqrt',
            'np.log': 'log',
            'np.exp': 'exp',
            'np.random.normal': 'rnorm',
            'np.random.uniform': 'runif',
            'np.linspace': 'seq',
            'np.arange': 'seq',

            # matplotlib / seaborn
            'plt.figure': 'ggplot',
            'plt.plot': 'geom_line',
            'plt.scatter': 'geom_point',
            'plt.hist': 'geom_histogram',
            'plt.bar': 'geom_bar',
            'plt.boxplot': 'geom_boxplot',
            'plt.show': 'print',
            'sns.scatterplot': 'geom_point',
            'sns.histplot': 'geom_histogram',
            'sns.boxplot': 'geom_boxplot',
            'sns.heatmap': 'geom_tile',

            # scipy.stats
            'scipy.stats.ttest_ind': 't.test',
            'scipy.stats.chi2_contingency': 'chisq.test',
            'scipy.stats.pearsonr': 'cor.test',
            'scipy.stats.spearmanr': 'cor.test',
            'scipy.stats.normaltest': 'shapiro.test',
            'stats.ttest_ind': 't.test',

            # scikit-learn
            'sklearn.linear_model.LinearRegression': 'lm',
            'sklearn.ensemble.RandomForestRegressor': 'randomForest',
            'sklearn.model_selection.train_test_split': 'sample',
        }

        self.syntax_patterns = [
            (r'\bTrue\b', 'TRUE'),
            (r'\bFalse\b', 'FALSE'),
            (r'\bNone\b', 'NULL'),
            (r'\blen\(', 'length('),
            (r'range\((\d+)\)', r'1:\1'),
            (r'range\((\d+),\s*(\d+)\)', r'\1:\2'),
            (r'\.split\(', '.strsplit('),
            (r'\.strip\(\)', '.str_trim()'),
            (r'\.lower\(\)', '.str_to_lower()'),
            (r'\.upper\(\)', '.str_to_upper()'),
            (r'\[0\]', '[1]'),
            (r'f"([^"]*)"', r'paste0("\1")'),
            (r"f'([^']*)'", r"paste0('\1')"),
        ]

    def _translate_import(self, line: str, module: str) -> list:
        """Return the output lines for one import of ``module``.

        The original statement is always kept as an R comment; the mapped
        ``library(...)`` text follows it when ``module`` is a known key of
        ``self.import_mappings``.
        """
        r_libraries = self.import_mappings.get(module)
        if r_libraries is None:
            return [f"# {line} # No direct R equivalent"]
        return [f"# {line}", r_libraries]

    def convert_imports(self, code: str) -> str:
        """Convert Python import statements to R library statements.

        Every line is stripped of leading/trailing whitespace (import or
        not) before being emitted, so indentation is not preserved.

        Handling per statement shape:
            * ``import X`` / ``import X as Y`` -> mapping for ``X``.
            * ``from X import Y``              -> mapping for ``X``.
            * ``from X import Y as Z``         -> commented out and flagged
              for manual handling.

        Args:
            code: Python source text.

        Returns:
            The text with import lines replaced; other lines are stripped
            but otherwise unchanged.
        """
        converted_lines = []

        for raw_line in code.split('\n'):
            line = raw_line.strip()

            if not line.startswith(('import ', 'from ')):
                converted_lines.append(line)
                continue

            if line.startswith('from '):
                # Check the "from" form first: every such line also contains
                # "import", so testing for "import ... as" beforehand would
                # make this branch unreachable.
                if ' as ' in line:
                    converted_lines.append(
                        f"# {line} # Handle specific imports manually"
                    )
                    continue
                module = line.split(' import ')[0].replace('from ', '').strip()
            else:
                # Covers both "import X" and "import X as Y".
                module = line.split(' as ')[0].replace('import ', '').strip()

            converted_lines.extend(self._translate_import(line, module))

        return '\n'.join(converted_lines)

    def convert_functions(self, code: str) -> str:
        """Convert Python function calls to R equivalents.

        Performs plain substring replacement using ``self.function_mappings``.
        Longer names are replaced first so that a short key cannot clobber a
        longer one sharing its prefix (e.g. ``df.drop`` inside ``df.dropna``).

        Args:
            code: Source text to rewrite.

        Returns:
            The text with every mapped name substituted.
        """
        for py_func in sorted(self.function_mappings, key=len, reverse=True):
            code = code.replace(py_func, self.function_mappings[py_func])
        return code

    def apply_syntax_patterns(self, code: str) -> str:
        """Apply regex patterns to convert Python syntax to R syntax.

        The ``(pattern, replacement)`` pairs in ``self.syntax_patterns`` are
        applied sequentially, each one over the output of the previous.
        """
        for pattern, replacement in self.syntax_patterns:
            code = re.sub(pattern, replacement, code)
        return code

    def convert_pandas_operations(self, code: str) -> str:
        """Convert common pandas operations to base-R/dplyr-style access.

        Rewrites, for a frame literally named ``df``:
            * ``df[df['col'] <op> value]`` -> ``df[df$col <op> value, ]``
            * ``df['col']``                -> ``df$col``
            * ``df.attr``                  -> ``df$attr``
        """
        # The row-filter rule must run before the column-access rule below;
        # otherwise the inner df['col'] is rewritten first and this pattern
        # can never match.
        # NOTE(review): the tail of this rule (after the operator class) was
        # lost in the source text and is reconstructed here as a row filter;
        # confirm against the upstream original.
        code = re.sub(
            r'df\[df\[[\'"](.*?)[\'"]\]\s*([><=!]+)\s*([^\]]+)\]',
            r'df[df$\1 \2 \3, ]',
            code,
        )
        code = re.sub(r'df\[[\'"](.*?)[\'"]\]', r'df$\1', code)
        code = re.sub(r'df\.(\w+)', r'df$\1', code)
        return code

    def convert_plotting(self, code: str) -> str:
        """Convert matplotlib/seaborn plotting to ggplot2."""
        conversions = [
            (r'plt\.figure\(figsize=\((\d+),\s*(\d+)\)\)', r'# Set figure size in ggplot theme'),
            (r'plt\.title\([\'"](.*?)[\'\"]\)', r'+ ggtitle("\1")'),
            (r'plt\.xlabel\([\'"](.*?)[\'\"]\)', r'+ xlab("\1")'),
            (r'plt\.ylabel\([\'"](.*?)[\'\"]\)', r'+ ylab("\1")'),
            (r'plt\.legend\(\)', r'+ theme(legend.position="right")'),
            (r'plt\.grid\(True\)', r'+ theme(panel.grid.major = element_line())'),
        ]

        for pattern, replacement in conversions:
            code = re.sub(pattern, replacement, code)

        return code

    def add_r_context(self, code: str) -> str:
        """Add R-specific context and comments.

        Prepends a fixed three-line comment header followed by two blank
        lines to ``code``.
        """
        r_header = (
            '# R Statistical Analysis Code\n'
            '# Converted from Python using Enhanced Converter with Gemini AI Validation\n'
            '# Install required packages: install.packages(c("dplyr", "ggplot2", "tidyr", "readr"))\n'
            '\n'
            '\n'
        )
        return r_header + code

    def convert_code(self, python_code: str) -> str:
        """Main conversion method that applies all transformations.

        Args:
            python_code: Python source text.

        Returns:
            The heuristic R translation, prefixed with the R header.
        """
        code = python_code.strip()

        code = self.convert_imports(code)
        # Plotting runs before the generic name substitution: the latter
        # rewrites ``plt.figure`` to ``ggplot``, which would stop the
        # ``plt.figure(figsize=...)`` pattern from ever matching.
        code = self.convert_plotting(code)
        code = self.convert_functions(code)
        code = self.convert_pandas_operations(code)
        # Core-syntax rewrites go last so ``plt.grid(True)`` is still
        # recognisable to the plotting pass above.
        code = self.apply_syntax_patterns(code)
        code = self.add_r_context(code)

        return code

    def convert_and_validate(self, python_code: str, use_gemini: bool = True) -> Dict:
        """Convert Python code to R and validate with Gemini AI.

        Args:
            python_code: Python source text.
            use_gemini: When true and the validator has an API key, the
                conversion is sent to ``self.validator.validate_conversion``.

        Returns:
            A dict with keys ``original_python``, ``converted_r``,
            ``validation`` and ``final_r_code``.  ``final_r_code`` is the
            validator's ``improved_code`` when it supplies a non-empty one,
            otherwise the heuristic conversion.  ``validation`` is the
            validator's response, a ``{"note": ...}`` hint when no API key is
            configured, or ``None`` when validation was skipped on request.
        """
        r_code = self.convert_code(python_code)

        result = {
            "original_python": python_code,
            "converted_r": r_code,
            "validation": None,
        }

        api_key = self.validator.api_key
        if use_gemini and api_key:
            print("🔍 Validating conversion with Gemini AI...")
            validation = self.validator.validate_conversion(python_code, r_code)
            result["validation"] = validation
            # Fall back to our own conversion when no improved code is given.
            result["final_r_code"] = validation.get("improved_code") or r_code
        else:
            result["final_r_code"] = r_code
            if not api_key:
                result["validation"] = {"note": "Set GEMINI_API_KEY for AI validation"}

        return result

    def print_results(self, results: Dict) -> None:
        """Pretty print the conversion results.

        Args:
            results: Dict as produced by ``convert_and_validate``; must
                contain ``original_python`` and ``final_r_code``.  When a
                truthy ``validation`` dict is present, its
                ``validation_score``, ``summary``, ``issues_found`` and
                ``suggestions`` entries are printed if available.
        """
        rule = "=" * 80

        print(rule)
        print("🐍 ORIGINAL PYTHON CODE")
        print(rule)
        print(results["original_python"])

        print("\n" + rule)
        print("📊 CONVERTED R CODE")
        print(rule)
        print(results["final_r_code"])

        validation = results.get("validation")
        if not validation:
            return

        print("\n" + rule)
        print("🤖 GEMINI AI VALIDATION")
        print(rule)

        # Compare against None so a legitimate score of 0 is still shown.
        if validation.get("validation_score") is not None:
            print(f"📈 Score: {validation['validation_score']}/100")

        if validation.get("summary"):
            print(f"📝 Summary: {validation['summary']}")

        if validation.get("issues_found"):
            print("\n⚠️  Issues Found:")
            for issue in validation["issues_found"]:
                print(f"   • {issue}")

        if validation.get("suggestions"):
            print("\n💡 Suggestions:")
            for suggestion in validation["suggestions"]:
                print(f"   • {suggestion}")