// ai-app-skr/services/geminiService.ts
import { GoogleGenAI, Modality, Type } from "@google/genai";
import { PronunciationFeedback, Language, ReadingLesson, ReadingDifficulty, OCRAnalysis, ListeningLesson } from "../types";
import { base64ToUint8Array, uint8ArrayToBase64 } from "../utils/audioUtils";
export const USER_API_KEY_STORAGE = 'sakura_user_api_key';
export const USER_BASE_URL_STORAGE = 'sakura_user_base_url';
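
// A settings screen might persist user overrides like this (sketch; the input
// values are hypothetical, the storage keys are the constants above):
//
//   localStorage.setItem(USER_API_KEY_STORAGE, apiKeyInput.trim());
//   localStorage.setItem(USER_BASE_URL_STORAGE, baseUrlInput.trim());
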
// Helper to decode audio for playback.
// Supports raw PCM (typically returned by Gemini TTS), which browsers cannot
// decode natively via decodeAudioData.
export const decodeAudioData = async (
  base64Data: string,
  audioContext: AudioContext
): Promise<AudioBuffer> => {
  const binaryString = atob(base64Data);
  const len = binaryString.length;
  const bytes = new Uint8Array(len);
  for (let i = 0; i < len; i++) {
    bytes[i] = binaryString.charCodeAt(i);
  }
  try {
    // Try standard decoding first (WAV/MP3 containers).
    // We clone the buffer because decodeAudioData detaches it.
    return await audioContext.decodeAudioData(bytes.buffer.slice(0));
  } catch (e) {
    // Fallback: treat as raw PCM (16-bit mono little-endian, 24 kHz default
    // for Gemini TTS output; some models use 16 kHz).
    // Truncate to an even byte count so the Int16Array view cannot throw.
    const pcmData = new Int16Array(bytes.buffer, 0, Math.floor(len / 2));
    const float32Data = new Float32Array(pcmData.length);
    for (let i = 0; i < pcmData.length; i++) {
      // Convert int16 to float32 (-1.0 to 1.0)
      float32Data[i] = pcmData[i] / 32768.0;
    }
    // Create buffer: 1 channel, 24000 Hz sample rate
    const audioBuffer = audioContext.createBuffer(1, float32Data.length, 24000);
    audioBuffer.getChannelData(0).set(float32Data);
    return audioBuffer;
  }
};

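// Playback sketch (hypothetical caller; `base64Audio` would come from
// generateSpeech below):
//
//   const ctx = new AudioContext();
//   const buffer = await decodeAudioData(base64Audio, ctx);
//   const source = ctx.createBufferSource();
//   source.buffer = buffer;
//   source.connect(ctx.destination);
//   source.start();
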
// Helper to check/request Veo key
export const ensureVeoKey = async (): Promise<void> => {
  // @ts-ignore
  if (window.aistudio) {
    // @ts-ignore
    const hasKey = await window.aistudio.hasSelectedApiKey();
    if (!hasKey) {
      // @ts-ignore
      await window.aistudio.openSelectKey();
    }
  }
};

const LANGUAGE_MAP = {
  en: "English",
  ja: "Japanese",
  zh: "Chinese (Simplified)"
};

class GeminiService {
  private getAi() {
    const userKey = localStorage.getItem(USER_API_KEY_STORAGE);
    const userBaseUrl = localStorage.getItem(USER_BASE_URL_STORAGE);
    const envKey = process.env.API_KEY;
    const keyToUse = (userKey && userKey.trim().length > 0) ? userKey : envKey;
    if (!keyToUse) {
      console.error("API_KEY is missing.");
      throw new Error("API Key is missing");
    }
    const config: any = { apiKey: keyToUse };
    if (userBaseUrl && userBaseUrl.trim().length > 0) {
      // Sanitize Base URL: remove quotes and trailing slashes
      const cleanUrl = userBaseUrl.trim().replace(/['"]/g, '').replace(/\/+$/, '');
      config.baseUrl = cleanUrl;
    }
    return new GoogleGenAI(config);
  }

  private async getApiKey(): Promise<string> {
    const userKey = localStorage.getItem(USER_API_KEY_STORAGE);
    const envKey = process.env.API_KEY;
    const key = (userKey && userKey.trim().length > 0) ? userKey : envKey;
    if (!key) throw new Error("No API Key available");
    return key;
  }

  private async retryOperation<T>(operation: () => Promise<T>, retries = 3, delay = 1000): Promise<T> {
    try {
      return await operation();
    } catch (error: any) {
      const errorMsg = error?.message || '';
      // Check for network/CORS/proxy errors specifically
      if (errorMsg.includes('Failed to fetch') || errorMsg.includes('NetworkError')) {
        console.error("Network Error Detected:", error);
        throw new Error("Network connection failed. Please check your Base URL (proxy) settings or internet connection.");
      }
      const isOverloaded =
        error?.status === 503 ||
        error?.response?.status === 503 ||
        errorMsg.includes('503') ||
        errorMsg.includes('overloaded');
      if (isOverloaded && retries > 0) {
        console.warn(`Model overloaded (503). Retrying in ${delay}ms...`);
        await new Promise(resolve => setTimeout(resolve, delay));
        return this.retryOperation(operation, retries - 1, delay * 2);
      }
      throw error;
    }
  }

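  // With the defaults (retries = 3, delay = 1000), a persistently overloaded
  // model is retried after roughly 1 s, 2 s, and 4 s before the error surfaces.
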
  // 1. Text Chat Response - Returns { text, model }
  async generateTextResponse(
    prompt: string,
    imageBase64?: string,
    useThinking: boolean = false,
    language: Language = 'en',
    modelOverride?: string,
    aiSpeakingLanguage: 'ja' | 'native' = 'native'
  ): Promise<{ text: string, model: string }> {
    const ai = this.getAi();
    // Ensure model name is clean
    let modelName = useThinking
      ? 'gemini-3-pro-preview'
      : (imageBase64 ? 'gemini-3-pro-preview' : (modelOverride || 'gemini-2.5-flash'));
    // Extra safety: strip quotes just in case
    modelName = modelName.replace(/['"]/g, '');
    const targetLangName = LANGUAGE_MAP[language];
    const parts: any[] = [];
    if (imageBase64) {
      parts.push({
        inlineData: {
          mimeType: 'image/jpeg',
          data: imageBase64
        }
      });
      parts.push({ text: `Analyze this image in the context of learning Japanese. Explain in ${targetLangName}: ` + prompt });
    } else {
      parts.push({ text: prompt });
    }
    let instruction = "";
    if (aiSpeakingLanguage === 'ja') {
      instruction = `You are Sakura, a Japanese language tutor.
IMPORTANT:
- Respond primarily in Japanese (日本語) to help the user practice immersion.
- Only use ${targetLangName} for complex grammar explanations or if the user asks specifically for a translation.
- Keep the tone encouraging and natural.`;
    } else {
      instruction = `You are Sakura, a friendly, encouraging, and highly skilled Japanese language tutor. You help users learn vocabulary, grammar, listening, and speaking. You provide clear explanations, examples, and translations.
IMPORTANT:
- You are teaching Japanese.
- However, the user speaks ${targetLangName}.
- Provide your explanations, translations, and feedback in ${targetLangName}.`;
    }
    const config: any = {
      systemInstruction: instruction,
    };
    if (useThinking) {
      config.thinkingConfig = { thinkingBudget: 32768 };
    }
    return this.retryOperation(async () => {
      const response = await ai.models.generateContent({
        model: modelName,
        contents: { parts },
        config: config
      });
      return {
        text: response.text || "I apologize, I couldn't generate a response.",
        model: modelName
      };
    });
  }

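  // Usage sketch (hypothetical chat handler; arguments follow the signature above):
  //
  //   const { text, model } = await geminiService.generateTextResponse(
  //     'Explain は vs が', undefined, false, 'en'
  //   );
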
  // Internal helper for a single TTS chunk
  private async _generateSpeechChunk(text: string): Promise<string | null> {
    const ai = this.getAi();
    return this.retryOperation(async () => {
      try {
        const response = await ai.models.generateContent({
          model: 'gemini-2.5-flash-preview-tts',
          contents: [{ parts: [{ text }] }],
          config: {
            responseModalities: [Modality.AUDIO],
            speechConfig: {
              voiceConfig: { prebuiltVoiceConfig: { voiceName: 'Kore' } },
            },
          },
        });
        return response.candidates?.[0]?.content?.parts?.[0]?.inlineData?.data || null;
      } catch (e) {
        console.error("TTS Chunk Error", e);
        throw e; // Rethrow so retryOperation can handle network errors
      }
    });
  }

  async generateSpeech(text: string): Promise<string | null> {
    if (!text || !text.trim()) return null;
    const MAX_CHUNK_LENGTH = 250; // Safe limit to prevent network timeouts on long generations
    // If the text is short, process it directly
    if (text.length <= MAX_CHUNK_LENGTH) {
      try {
        return await this._generateSpeechChunk(text);
      } catch (e) {
        return null;
      }
    }
    // Split the text into chunks by sentence to avoid breaking words
    const regex = /[^。!?.!?\n]+[。!?.!?\n]*|[^。!?.!?\n]+$/g;
    const sentences = text.match(regex) || [text];
    const chunks: string[] = [];
    let currentChunk = '';
    for (const sentence of sentences) {
      if ((currentChunk + sentence).length > MAX_CHUNK_LENGTH) {
        if (currentChunk) chunks.push(currentChunk);
        currentChunk = sentence;
        // Force-split if a single sentence exceeds the max length
        while (currentChunk.length > MAX_CHUNK_LENGTH) {
          chunks.push(currentChunk.slice(0, MAX_CHUNK_LENGTH));
          currentChunk = currentChunk.slice(MAX_CHUNK_LENGTH);
        }
      } else {
        currentChunk += sentence;
      }
    }
    if (currentChunk) chunks.push(currentChunk);
    try {
      // Generate chunks in parallel to speed up total time
      // Note: Promise.all order is preserved
      const results = await Promise.all(chunks.map(chunk => this._generateSpeechChunk(chunk)));
      // If any chunk failed, the whole audio is compromised
      if (results.some(r => r === null)) return null;
      // Convert Base64 -> Uint8Array
      const audioSegments = results.map(r => base64ToUint8Array(r!));
      // Concatenate raw PCM data (headerless, so segments can be joined directly)
      const totalLength = audioSegments.reduce((acc, cur) => acc + cur.length, 0);
      const combined = new Uint8Array(totalLength);
      let offset = 0;
      for (const seg of audioSegments) {
        combined.set(seg, offset);
        offset += seg.length;
      }
      // Convert back to Base64 for playback/storage
      return uint8ArrayToBase64(combined);
    } catch (e) {
      console.error("TTS Assembly Error", e);
      return null;
    }
  }

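  // Worked example: a 600-character passage splits at sentence boundaries into
  // roughly three chunks of at most 250 characters each, which are synthesized
  // in parallel and concatenated back into a single raw PCM stream.
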
  async transcribeAudio(audioBase64: string): Promise<string> {
    const ai = this.getAi();
    return this.retryOperation(async () => {
      const response = await ai.models.generateContent({
        model: 'gemini-2.5-flash',
        contents: {
          parts: [
            { inlineData: { mimeType: 'audio/wav', data: audioBase64 } },
            { text: "Transcribe accurately." },
          ],
        },
      });
      return response.text || "";
    });
  }

  async generateImage(prompt: string): Promise<string | null> {
    const ai = this.getAi();
    return this.retryOperation(async () => {
      try {
        const response = await ai.models.generateImages({
          model: 'imagen-4.0-generate-001',
          prompt: prompt + ", in the style of a Japanese textbook illustration",
          config: { numberOfImages: 1, outputMimeType: 'image/jpeg', aspectRatio: '1:1' },
        });
        const bytes = response.generatedImages?.[0]?.image?.imageBytes;
        return bytes ? `data:image/jpeg;base64,${bytes}` : null;
      } catch (e) {
        console.error("Image Gen Error", e);
        throw e;
      }
    });
  }

  async editImage(base64Original: string, prompt: string): Promise<string | null> {
    const ai = this.getAi();
    return this.retryOperation(async () => {
      try {
        const cleanBase64 = base64Original.replace(/^data:image\/(png|jpeg|jpg|webp|heic|heif);base64,/i, "");
        const response = await ai.models.generateContent({
          model: 'gemini-2.5-flash-image',
          contents: {
            parts: [
              { inlineData: { data: cleanBase64, mimeType: 'image/jpeg' } },
              { text: prompt }
            ]
          },
          config: { responseModalities: [Modality.IMAGE] }
        });
        for (const part of response.candidates?.[0]?.content?.parts || []) {
          if (part.inlineData) return `data:image/png;base64,${part.inlineData.data}`;
        }
        return null;
      } catch (e) {
        console.error("Image Edit Error", e);
        throw e;
      }
    });
  }

  async generateVideo(prompt: string, onStatusUpdate: (status: string) => void): Promise<string | null> {
    await ensureVeoKey();
    const ai = this.getAi();
    try {
      onStatusUpdate("Initializing Veo...");
      let operation = await ai.models.generateVideos({
        model: 'veo-3.1-fast-generate-preview',
        prompt: prompt,
        config: { numberOfVideos: 1, resolution: '720p', aspectRatio: '16:9' }
      });
      onStatusUpdate("Dreaming up video...");
      // Poll the long-running operation until it completes
      while (!operation.done) {
        await new Promise(resolve => setTimeout(resolve, 5000));
        operation = await ai.operations.getVideosOperation({ operation: operation });
      }
      const videoUri = operation.response?.generatedVideos?.[0]?.video?.uri;
      if (!videoUri) return null;
      // The download URI requires the API key appended as a query parameter
      const apiKey = await this.getApiKey();
      const videoRes = await fetch(`${videoUri}&key=${apiKey}`);
      const blob = await videoRes.blob();
      return URL.createObjectURL(blob);
    } catch (e) {
      console.error("Veo Error", e);
      return null;
    }
  }

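  // Usage sketch (hypothetical caller; setStatusMessage is an assumed UI setter):
  //
  //   const url = await geminiService.generateVideo(
  //     'A cherry blossom festival in Kyoto',
  //     (status) => setStatusMessage(status)
  //   );
  //   if (url) videoEl.src = url; // revoke with URL.revokeObjectURL(url) when done
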
  async analyzeSpeakingPerformance(audioBase64: string, scenarioContext: string, historyContext: string, language: Language = 'en'): Promise<PronunciationFeedback | null> {
    const ai = this.getAi();
    const targetLangName = LANGUAGE_MAP[language];
    const prompt = `Roleplay: ${scenarioContext}. History: ${historyContext}. Listen, Transcribe, Reply, Evaluate (JSON). Translation/Advice in ${targetLangName}.`;
    return this.retryOperation(async () => {
      const response = await ai.models.generateContent({
        model: 'gemini-2.5-flash',
        contents: {
          parts: [{ inlineData: { mimeType: 'audio/wav', data: audioBase64 } }, { text: prompt }]
        },
        config: {
          responseMimeType: "application/json",
          responseSchema: {
            type: Type.OBJECT,
            properties: {
              transcription: { type: Type.STRING },
              response: { type: Type.STRING },
              translation: { type: Type.STRING },
              score: { type: Type.INTEGER },
              pronunciationIssues: { type: Type.ARRAY, items: { type: Type.STRING } },
              advice: { type: Type.STRING }
            },
            required: ["transcription", "response", "translation", "score", "pronunciationIssues", "advice"]
          }
        }
      });
      return response.text ? JSON.parse(response.text) : null;
    });
  }

  async generateReadingLesson(topic: string, difficulty: ReadingDifficulty, language: Language): Promise<ReadingLesson | null> {
    const ai = this.getAi();
    const targetLangName = LANGUAGE_MAP[language];
    const prompt = `Create a complete Japanese reading lesson on "${topic}", level ${difficulty}.
The 'japaneseContent' MUST be a complete article or story (at least 300 characters).
Output JSON with title, japaneseContent, translation (${targetLangName}), vocabulary, and grammarPoints (list of key grammar used in the text with explanations).`;
    return this.retryOperation(async () => {
      const response = await ai.models.generateContent({
        model: 'gemini-2.5-flash',
        contents: { parts: [{ text: prompt }] },
        config: {
          responseMimeType: "application/json",
          responseSchema: {
            type: Type.OBJECT,
            properties: {
              title: { type: Type.STRING },
              japaneseContent: { type: Type.STRING },
              translation: { type: Type.STRING },
              vocabulary: { type: Type.ARRAY, items: { type: Type.OBJECT, properties: { word: { type: Type.STRING }, reading: { type: Type.STRING }, meaning: { type: Type.STRING } } } },
              grammarPoints: { type: Type.ARRAY, items: { type: Type.OBJECT, properties: { point: { type: Type.STRING }, explanation: { type: Type.STRING } } } }
            },
            required: ["title", "japaneseContent", "translation", "vocabulary", "grammarPoints"]
          }
        }
      });
      return response.text ? JSON.parse(response.text) : null;
    });
  }

  async generateListeningLesson(topic: string, difficulty: ReadingDifficulty, language: Language): Promise<ListeningLesson | null> {
    const ai = this.getAi();
    const targetLangName = LANGUAGE_MAP[language];
    // Prompt asks for a conversation or monologue suitable for listening practice
    const prompt = `Create a Japanese listening practice script on "${topic}", level ${difficulty}. It should be a conversation or monologue.
Output JSON with:
- title
- script (The full Japanese text of the conversation/monologue)
- translation (The full text in ${targetLangName})
- vocabulary (Key words)
- questions (3 multiple choice comprehension questions in ${targetLangName})
- Each question needs: question, options (array of 3 strings), correctIndex (0-2), explanation.
`;
    return this.retryOperation(async () => {
      const response = await ai.models.generateContent({
        model: 'gemini-2.5-flash',
        contents: { parts: [{ text: prompt }] },
        config: {
          responseMimeType: "application/json",
          responseSchema: {
            type: Type.OBJECT,
            properties: {
              title: { type: Type.STRING },
              script: { type: Type.STRING },
              translation: { type: Type.STRING },
              vocabulary: { type: Type.ARRAY, items: { type: Type.OBJECT, properties: { word: { type: Type.STRING }, reading: { type: Type.STRING }, meaning: { type: Type.STRING } } } },
              questions: {
                type: Type.ARRAY,
                items: {
                  type: Type.OBJECT,
                  properties: {
                    id: { type: Type.STRING },
                    question: { type: Type.STRING },
                    options: { type: Type.ARRAY, items: { type: Type.STRING } },
                    correctIndex: { type: Type.INTEGER },
                    explanation: { type: Type.STRING }
                  },
                  required: ["question", "options", "correctIndex", "explanation"]
                }
              }
            },
            required: ["title", "script", "translation", "vocabulary", "questions"]
          }
        }
      });
      return response.text ? JSON.parse(response.text) : null;
    });
  }

  async generateReadingTutorResponse(question: string, lesson: ReadingLesson | ListeningLesson, history: string, language: Language): Promise<string> {
    const ai = this.getAi();
    // Handle both ReadingLesson (japaneseContent) and ListeningLesson (script)
    const content = 'japaneseContent' in lesson ? lesson.japaneseContent : lesson.script;
    // Include the lesson text itself so the tutor can actually reference it
    const prompt = `Tutor for text "${lesson.title}". Text: ${content}. Question: "${question}". History: ${history}. Explain in ${LANGUAGE_MAP[language]}.`;
    return this.retryOperation(async () => {
      const res = await ai.models.generateContent({
        model: 'gemini-2.5-flash',
        contents: { parts: [{ text: prompt }] }
      });
      return res.text || "";
    });
  }

  async translateText(text: string, target: string, source: string = "Auto"): Promise<string> {
    const ai = this.getAi();
    return this.retryOperation(async () => {
      const res = await ai.models.generateContent({
        model: 'gemini-2.5-flash',
        contents: { parts: [{ text: `Translate the following text from ${source} to ${target}.` }, { text: text }] },
        config: {
          responseMimeType: "application/json",
          responseSchema: {
            type: Type.OBJECT,
            properties: { translation: { type: Type.STRING } },
            required: ["translation"]
          }
        }
      });
      return (res.text ? JSON.parse(res.text).translation : "") || "";
    });
  }

  async translateImage(base64: string, target: string, source: string = "Auto"): Promise<{ original: string; translated: string } | null> {
    const ai = this.getAi();
    const cleanBase64 = base64.replace(/^data:image\/(png|jpeg|jpg|webp|heic|heif);base64,/i, "");
    return this.retryOperation(async () => {
      const res = await ai.models.generateContent({
        model: 'gemini-2.5-flash',
        contents: {
          parts: [{ inlineData: { mimeType: 'image/jpeg', data: cleanBase64 } }, { text: `Extract text (Language: ${source}) and translate to ${target}. JSON output: original, translated.` }]
        },
        config: {
          responseMimeType: "application/json",
          responseSchema: {
            type: Type.OBJECT,
            properties: { original: { type: Type.STRING }, translated: { type: Type.STRING } },
            required: ["original", "translated"]
          }
        }
      });
      return res.text ? JSON.parse(res.text) : null;
    });
  }

  async extractAndAnalyzeText(base64: string, language: Language): Promise<OCRAnalysis | null> {
    const ai = this.getAi();
    const cleanBase64 = base64.replace(/^data:image\/(png|jpeg|jpg|webp|heic|heif);base64,/i, "");
    const targetLang = LANGUAGE_MAP[language];
    const prompt = `OCR and analyze text. Explain in ${targetLang}. JSON: extractedText, detectedLanguage, summary, vocabulary, grammarPoints.`;
    return this.retryOperation(async () => {
      const res = await ai.models.generateContent({
        model: 'gemini-2.5-flash',
        contents: {
          parts: [{ inlineData: { mimeType: 'image/jpeg', data: cleanBase64 } }, { text: prompt }]
        },
        config: {
          responseMimeType: "application/json",
          responseSchema: {
            type: Type.OBJECT,
            properties: {
              extractedText: { type: Type.STRING },
              detectedLanguage: { type: Type.STRING },
              summary: { type: Type.STRING },
              vocabulary: { type: Type.ARRAY, items: { type: Type.OBJECT, properties: { word: { type: Type.STRING }, reading: { type: Type.STRING }, meaning: { type: Type.STRING } } } },
              grammarPoints: { type: Type.ARRAY, items: { type: Type.OBJECT, properties: { point: { type: Type.STRING }, explanation: { type: Type.STRING } } } }
            },
            required: ["extractedText", "detectedLanguage", "summary", "vocabulary", "grammarPoints"]
          }
        }
      });
      return res.text ? JSON.parse(res.text) : null;
    });
  }
}
export const geminiService = new GeminiService();
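
// Usage sketch (hypothetical consumer; import path depends on the caller's location):
//
//   import { geminiService } from './services/geminiService';
//   const translated = await geminiService.translateText('こんにちは', 'English');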