import { GoogleGenAI, Modality, Type } from "@google/genai";
import { PronunciationFeedback, Language, ReadingLesson, ReadingDifficulty, OCRAnalysis, ListeningLesson } from "../types";
import { base64ToUint8Array, uint8ArrayToBase64 } from "../utils/audioUtils";

export const USER_API_KEY_STORAGE = 'sakura_user_api_key';
export const USER_BASE_URL_STORAGE = 'sakura_user_base_url';

// Helper to decode audio for playback.
// Supports raw PCM (typically returned by Gemini TTS), which the browser
// cannot decode automatically.
export const decodeAudioData = async (
  base64Data: string,
  audioContext: AudioContext
): Promise<AudioBuffer> => {
  const binaryString = atob(base64Data);
  const len = binaryString.length;
  const bytes = new Uint8Array(len);
  for (let i = 0; i < len; i++) {
    bytes[i] = binaryString.charCodeAt(i);
  }
  try {
    // Try standard decoding first (wav/mp3 containers).
    // We clone the buffer because decodeAudioData detaches it.
    return await audioContext.decodeAudioData(bytes.buffer.slice(0));
  } catch (e) {
    // Fallback: treat as raw PCM (16-bit; Gemini TTS defaults to 24 kHz, sometimes 16 kHz).
    // Assumes 24 kHz mono 16-bit little-endian, the typical Gemini TTS raw output.
    const pcmData = new Int16Array(bytes.buffer);
    const float32Data = new Float32Array(pcmData.length);
    for (let i = 0; i < pcmData.length; i++) {
      // Convert int16 to float32 (-1.0 to 1.0)
      float32Data[i] = pcmData[i] / 32768.0;
    }
    // Create buffer: 1 channel, 24000 Hz sample rate
    const audioBuffer = audioContext.createBuffer(1, float32Data.length, 24000);
    audioBuffer.getChannelData(0).set(float32Data);
    return audioBuffer;
  }
};

// Helper to check/request a Veo key via the AI Studio host environment.
export const ensureVeoKey = async (): Promise<void> => {
  // @ts-ignore
  if (window.aistudio) {
    // @ts-ignore
    const hasKey = await window.aistudio.hasSelectedApiKey();
    if (!hasKey) {
      // @ts-ignore
      await window.aistudio.openSelectKey();
    }
  }
};

const LANGUAGE_MAP = {
  en: "English",
  ja: "Japanese",
  zh: "Chinese (Simplified)"
};

class GeminiService {
  private getAi() {
    const userKey = localStorage.getItem(USER_API_KEY_STORAGE);
    const userBaseUrl = localStorage.getItem(USER_BASE_URL_STORAGE);
    const envKey = process.env.API_KEY;
    const keyToUse = (userKey && userKey.trim().length > 0) ? userKey : envKey;

    if (!keyToUse) {
      console.error("API_KEY is missing.");
      throw new Error("API Key is missing");
    }

    const config: any = { apiKey: keyToUse };
    if (userBaseUrl && userBaseUrl.trim().length > 0) {
      // Sanitize the base URL: remove quotes and trailing slashes.
      const cleanUrl = userBaseUrl.trim().replace(/['"]/g, '').replace(/\/+$/, '');
      config.baseUrl = cleanUrl;
    }
    return new GoogleGenAI(config);
  }

  private async getApiKey(): Promise<string> {
    const userKey = localStorage.getItem(USER_API_KEY_STORAGE);
    const envKey = process.env.API_KEY;
    const key = (userKey && userKey.trim().length > 0) ? userKey : envKey;
    if (!key) throw new Error("No API Key available");
    return key;
  }
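  // Wraps a call with exponential backoff. With the defaults
  // (retries = 3, delay = 1000 ms), an overloaded (503) request is retried
  // after roughly 1 s, 2 s, and 4 s before the error is surfaced.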
  private async retryOperation<T>(operation: () => Promise<T>, retries = 3, delay = 1000): Promise<T> {
    try {
      return await operation();
    } catch (error: any) {
      const errorMsg = error?.message || '';
      // Check for network/CORS/proxy errors specifically.
      if (errorMsg.includes('Failed to fetch') || errorMsg.includes('NetworkError')) {
        console.error("Network Error Detected:", error);
        throw new Error("Network connection failed. Please check your Base URL (Proxy) settings or internet connection.");
      }
      const isOverloaded = error?.status === 503
        || error?.response?.status === 503
        || errorMsg.includes('503')
        || errorMsg.includes('overloaded');
      if (isOverloaded && retries > 0) {
        console.warn(`Model overloaded (503). Retrying...`);
        await new Promise(resolve => setTimeout(resolve, delay));
        return this.retryOperation(operation, retries - 1, delay * 2);
      }
      throw error;
    }
  }

  // 1. Text Chat Response - Returns { text, model }
  async generateTextResponse(
    prompt: string,
    imageBase64?: string,
    useThinking: boolean = false,
    language: Language = 'en',
    modelOverride?: string,
    aiSpeakingLanguage: 'ja' | 'native' = 'native'
  ): Promise<{ text: string, model: string }> {
    const ai = this.getAi();

    // Ensure the model name is clean.
    let modelName = useThinking
      ? 'gemini-3-pro-preview'
      : (imageBase64 ? 'gemini-3-pro-preview' : (modelOverride || 'gemini-2.5-flash'));
    // Extra safety: strip quotes just in case.
    modelName = modelName.replace(/['"]/g, '');

    const targetLangName = LANGUAGE_MAP[language];
    const parts: any[] = [];
    if (imageBase64) {
      parts.push({ inlineData: { mimeType: 'image/jpeg', data: imageBase64 } });
      parts.push({ text: `Analyze this image in the context of learning Japanese. Explain in ${targetLangName}: ` + prompt });
    } else {
      parts.push({ text: prompt });
    }

    let instruction = "";
    if (aiSpeakingLanguage === 'ja') {
      instruction = `You are Sakura, a Japanese language tutor.
IMPORTANT:
- Respond primarily in Japanese (日本語) to help the user practice immersion.
- Only use ${targetLangName} for complex grammar explanations or if the user asks specifically for a translation.
- Keep the tone encouraging and natural.`;
    } else {
      instruction = `You are Sakura, a friendly, encouraging, and highly skilled Japanese language tutor.
You help users learn vocabulary, grammar, listening, and speaking.
You provide clear explanations, examples, and translations.
IMPORTANT:
- You are teaching Japanese.
- However, the user speaks ${targetLangName}.
- Provide your explanations, translations, and feedback in ${targetLangName}.`;
    }

    const config: any = {
      systemInstruction: instruction,
    };
    if (useThinking) {
      config.thinkingConfig = { thinkingBudget: 32768 };
    }

    return this.retryOperation(async () => {
      const response = await ai.models.generateContent({
        model: modelName,
        contents: { parts },
        config: config
      });
      return {
        text: response.text || "I apologize, I couldn't generate a response.",
        model: modelName
      };
    });
  }
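  // Usage sketch (illustrative only; arguments follow the signature above):
  //
  //   const { text, model } = await geminiService.generateTextResponse(
  //     "What is the difference between は and が?",
  //     undefined, // no image
  //     false,     // no extended thinking
  //     'en'       // explanations in English
  //   );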
  // Internal helper for a single TTS chunk.
  private async _generateSpeechChunk(text: string): Promise<string | null> {
    const ai = this.getAi();
    return this.retryOperation(async () => {
      try {
        const response = await ai.models.generateContent({
          model: 'gemini-2.5-flash-preview-tts',
          contents: [{ parts: [{ text }] }],
          config: {
            responseModalities: [Modality.AUDIO],
            speechConfig: {
              voiceConfig: { prebuiltVoiceConfig: { voiceName: 'Kore' } },
            },
          },
        });
        return response.candidates?.[0]?.content?.parts?.[0]?.inlineData?.data || null;
      } catch (e) {
        console.error("TTS Chunk Error", e);
        throw e; // Rethrow so retryOperation can handle network errors.
      }
    });
  }

  async generateSpeech(text: string): Promise<string | null> {
    if (!text || !text.trim()) return null;

    const MAX_CHUNK_LENGTH = 250; // Safe limit to prevent network timeouts on long generations.

    // If the text is short, process it directly.
    if (text.length <= MAX_CHUNK_LENGTH) {
      try {
        return await this._generateSpeechChunk(text);
      } catch (e) {
        return null;
      }
    }

    // Split the text into chunks by sentence to avoid breaking words.
    const regex = /[^。!?.!?\n]+[。!?.!?\n]*|[^。!?.!?\n]+$/g;
    const sentences = text.match(regex) || [text];
    const chunks: string[] = [];
    let currentChunk = '';
    for (const sentence of sentences) {
      if ((currentChunk + sentence).length > MAX_CHUNK_LENGTH) {
        if (currentChunk) chunks.push(currentChunk);
        currentChunk = sentence;
        // Force-split if a single sentence exceeds the max length.
        while (currentChunk.length > MAX_CHUNK_LENGTH) {
          chunks.push(currentChunk.slice(0, MAX_CHUNK_LENGTH));
          currentChunk = currentChunk.slice(MAX_CHUNK_LENGTH);
        }
      } else {
        currentChunk += sentence;
      }
    }
    if (currentChunk) chunks.push(currentChunk);

    try {
      // Generate chunks in parallel to reduce total latency.
      // Note: Promise.all preserves order.
      const results = await Promise.all(chunks.map(chunk => this._generateSpeechChunk(chunk)));

      // If any chunk failed, the whole audio is compromised.
      if (results.some(r => r === null)) return null;

      // Convert Base64 -> Uint8Array.
      const audioSegments = results.map(r => base64ToUint8Array(r!));

      // Concatenate the raw PCM data.
      const totalLength = audioSegments.reduce((acc, cur) => acc + cur.length, 0);
      const combined = new Uint8Array(totalLength);
      let offset = 0;
      for (const seg of audioSegments) {
        combined.set(seg, offset);
        offset += seg.length;
      }

      // Convert back to Base64 for playback/storage.
      return uint8ArrayToBase64(combined);
    } catch (e) {
      console.error("TTS Assembly Error", e);
      return null;
    }
  }
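  // Playback sketch (illustrative): the returned Base64 is raw PCM, so it
  // must go through decodeAudioData above rather than an <audio> element:
  //
  //   const base64 = await geminiService.generateSpeech("こんにちは、世界！");
  //   if (base64) {
  //     const ctx = new AudioContext();
  //     const buffer = await decodeAudioData(base64, ctx);
  //     const source = ctx.createBufferSource();
  //     source.buffer = buffer;
  //     source.connect(ctx.destination);
  //     source.start();
  //   }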
  async transcribeAudio(audioBase64: string): Promise<string> {
    const ai = this.getAi();
    return this.retryOperation(async () => {
      const response = await ai.models.generateContent({
        model: 'gemini-2.5-flash',
        contents: {
          parts: [
            { inlineData: { mimeType: 'audio/wav', data: audioBase64 } },
            { text: "Transcribe accurately." },
          ],
        },
      });
      return response.text || "";
    });
  }

  async generateImage(prompt: string): Promise<string | null> {
    const ai = this.getAi();
    return this.retryOperation(async () => {
      try {
        const response = await ai.models.generateImages({
          model: 'imagen-4.0-generate-001',
          prompt: prompt + " style of a japanese textbook illustration",
          config: { numberOfImages: 1, outputMimeType: 'image/jpeg', aspectRatio: '1:1' },
        });
        const bytes = response.generatedImages?.[0]?.image?.imageBytes;
        return bytes ? `data:image/jpeg;base64,${bytes}` : null;
      } catch (e) {
        console.error("Image Gen Error", e);
        throw e;
      }
    });
  }

  async editImage(base64Original: string, prompt: string): Promise<string | null> {
    const ai = this.getAi();
    return this.retryOperation(async () => {
      try {
        const cleanBase64 = base64Original.replace(/^data:image\/(png|jpeg|jpg|webp|heic|heif);base64,/i, "");
        const response = await ai.models.generateContent({
          model: 'gemini-2.5-flash-image',
          contents: {
            parts: [
              { inlineData: { data: cleanBase64, mimeType: 'image/jpeg' } },
              { text: prompt }
            ]
          },
          config: { responseModalities: [Modality.IMAGE] }
        });
        for (const part of response.candidates?.[0]?.content?.parts || []) {
          if (part.inlineData) return `data:image/png;base64,${part.inlineData.data}`;
        }
        return null;
      } catch (e) {
        console.error("Image Edit Error", e);
        throw e;
      }
    });
  }

  async generateVideo(prompt: string, onStatusUpdate: (status: string) => void): Promise<string | null> {
    await ensureVeoKey();
    const ai = this.getAi();
    try {
      onStatusUpdate("Initializing Veo...");
      let operation = await ai.models.generateVideos({
        model: 'veo-3.1-fast-generate-preview',
        prompt: prompt,
        config: { numberOfVideos: 1, resolution: '720p', aspectRatio: '16:9' }
      });

      onStatusUpdate("Dreaming up video...");
      while (!operation.done) {
        await new Promise(resolve => setTimeout(resolve, 5000));
        operation = await ai.operations.getVideosOperation({ operation: operation });
      }

      const videoUri = operation.response?.generatedVideos?.[0]?.video?.uri;
      if (!videoUri) return null;

      const apiKey = await this.getApiKey();
      const videoRes = await fetch(`${videoUri}&key=${apiKey}`);
      const blob = await videoRes.blob();
      return URL.createObjectURL(blob);
    } catch (e) {
      console.error("Veo Error", e);
      return null;
    }
  }

  async analyzeSpeakingPerformance(audioBase64: string, scenarioContext: string, historyContext: string, language: Language = 'en'): Promise<PronunciationFeedback | null> {
    const ai = this.getAi();
    const targetLangName = LANGUAGE_MAP[language];
    const prompt = `Roleplay: ${scenarioContext}. History: ${historyContext}. Listen, Transcribe, Reply, Evaluate (JSON). Translation/Advice in ${targetLangName}.`;
    return this.retryOperation(async () => {
      const response = await ai.models.generateContent({
        model: 'gemini-2.5-flash',
        contents: { parts: [{ inlineData: { mimeType: 'audio/wav', data: audioBase64 } }, { text: prompt }] },
        config: {
          responseMimeType: "application/json",
          responseSchema: {
            type: Type.OBJECT,
            properties: {
              transcription: { type: Type.STRING },
              response: { type: Type.STRING },
              translation: { type: Type.STRING },
              score: { type: Type.INTEGER },
              pronunciationIssues: { type: Type.ARRAY, items: { type: Type.STRING } },
              advice: { type: Type.STRING }
            },
            required: ["transcription", "response", "translation", "score", "pronunciationIssues", "advice"]
          }
        }
      });
      return response.text ? JSON.parse(response.text) : null;
    });
  }
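  // Note: the JSON-returning lesson methods below rely on responseSchema to
  // constrain the model output. JSON.parse runs inside the retryOperation
  // callback, so a malformed reply throws to the caller instead of being
  // retried (only 503 and network errors are retried).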
  async generateReadingLesson(topic: string, difficulty: ReadingDifficulty, language: Language): Promise<ReadingLesson | null> {
    const ai = this.getAi();
    const targetLangName = LANGUAGE_MAP[language];
    const prompt = `Create a complete Japanese reading lesson on "${topic}", level ${difficulty}. The 'japaneseContent' MUST be a complete article or story (at least 300 characters). Output JSON with title, japaneseContent, translation (in ${targetLangName}), vocabulary (meanings in ${targetLangName}), and grammarPoints (explanations in ${targetLangName}).`;
    return this.retryOperation(async () => {
      const response = await ai.models.generateContent({
        model: 'gemini-2.5-flash',
        contents: { parts: [{ text: prompt }] },
        config: {
          responseMimeType: "application/json",
          responseSchema: {
            type: Type.OBJECT,
            properties: {
              title: { type: Type.STRING },
              japaneseContent: { type: Type.STRING },
              translation: { type: Type.STRING },
              vocabulary: { type: Type.ARRAY, items: { type: Type.OBJECT, properties: { word: { type: Type.STRING }, reading: { type: Type.STRING }, meaning: { type: Type.STRING } } } },
              grammarPoints: { type: Type.ARRAY, items: { type: Type.OBJECT, properties: { point: { type: Type.STRING }, explanation: { type: Type.STRING } } } }
            },
            required: ["title", "japaneseContent", "translation", "vocabulary", "grammarPoints"]
          }
        }
      });
      return response.text ? JSON.parse(response.text) : null;
    });
  }

  async generateListeningLesson(topic: string, difficulty: ReadingDifficulty, language: Language): Promise<ListeningLesson | null> {
    const ai = this.getAi();
    const targetLangName = LANGUAGE_MAP[language];
    // The prompt asks for a conversation or monologue suitable for listening practice.
    const prompt = `Create a Japanese listening practice script on "${topic}", level ${difficulty}. Output JSON with:
- title
- script (The full Japanese text of the conversation/monologue)
- translation (The full text in ${targetLangName})
- vocabulary (Key words with meanings in ${targetLangName})
- questions (3 multiple choice comprehension questions in ${targetLangName})
- Each question needs: question, options (array of 3 strings), correctIndex (0-2), explanation (in ${targetLangName}).
- grammarPoints (explanations in ${targetLangName}).
`;
    return this.retryOperation(async () => {
      const response = await ai.models.generateContent({
        model: 'gemini-2.5-flash',
        contents: { parts: [{ text: prompt }] },
        config: {
          responseMimeType: "application/json",
          responseSchema: {
            type: Type.OBJECT,
            properties: {
              title: { type: Type.STRING },
              script: { type: Type.STRING },
              translation: { type: Type.STRING },
              vocabulary: { type: Type.ARRAY, items: { type: Type.OBJECT, properties: { word: { type: Type.STRING }, reading: { type: Type.STRING }, meaning: { type: Type.STRING } } } },
              questions: {
                type: Type.ARRAY,
                items: {
                  type: Type.OBJECT,
                  properties: {
                    id: { type: Type.STRING },
                    question: { type: Type.STRING },
                    options: { type: Type.ARRAY, items: { type: Type.STRING } },
                    correctIndex: { type: Type.INTEGER },
                    explanation: { type: Type.STRING }
                  },
                  required: ["question", "options", "correctIndex", "explanation"]
                }
              },
              grammarPoints: { type: Type.ARRAY, items: { type: Type.OBJECT, properties: { point: { type: Type.STRING }, explanation: { type: Type.STRING } } } }
            },
            required: ["title", "script", "translation", "vocabulary", "questions", "grammarPoints"]
          }
        }
      });
      return response.text ? JSON.parse(response.text) : null;
    });
  }
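  // Grading sketch (illustrative; field names follow the responseSchema above,
  // and the ListeningLesson type in ../types is assumed to mirror it):
  //
  //   const lesson = await geminiService.generateListeningLesson(topic, difficulty, 'en');
  //   const q = lesson?.questions[0];
  //   if (q && userChoice === q.correctIndex) {
  //     console.log("Correct!", q.explanation);
  //   }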
  async generateReadingTutorResponse(question: string, lesson: ReadingLesson | ListeningLesson, history: string, language: Language): Promise<string> {
    const ai = this.getAi();
    // Handle both ReadingLesson (japaneseContent) and ListeningLesson (script).
    const content = 'japaneseContent' in lesson ? lesson.japaneseContent : lesson.script;
    const prompt = `Tutor for text "${lesson.title}": ${content}. Question: "${question}". History: ${history}. Explain in ${LANGUAGE_MAP[language]}.`;
    return this.retryOperation(async () => {
      const res = await ai.models.generateContent({
        model: 'gemini-2.5-flash',
        contents: { parts: [{ text: prompt }] }
      });
      return res.text || "";
    });
  }

  async translateText(text: string, target: string, source: string = "Auto"): Promise<string> {
    const ai = this.getAi();
    return this.retryOperation(async () => {
      const res = await ai.models.generateContent({
        model: 'gemini-2.5-flash',
        contents: { parts: [{ text: `Translate the following text from ${source} to ${target}.` }, { text: text }] },
        config: {
          responseMimeType: "application/json",
          responseSchema: {
            type: Type.OBJECT,
            properties: { translation: { type: Type.STRING } },
            required: ["translation"]
          }
        }
      });
      return (res.text ? JSON.parse(res.text).translation : "") || "";
    });
  }

  async translateImage(base64: string, target: string, source: string = "Auto"): Promise<{ original: string; translated: string } | null> {
    const ai = this.getAi();
    const cleanBase64 = base64.replace(/^data:image\/(png|jpeg|jpg|webp|heic|heif);base64,/i, "");
    return this.retryOperation(async () => {
      const res = await ai.models.generateContent({
        model: 'gemini-2.5-flash',
        contents: { parts: [{ inlineData: { mimeType: 'image/jpeg', data: cleanBase64 } }, { text: `Extract text (Language: ${source}) and translate to ${target}. JSON output: original, translated.` }] },
        config: {
          responseMimeType: "application/json",
          responseSchema: {
            type: Type.OBJECT,
            properties: { original: { type: Type.STRING }, translated: { type: Type.STRING } },
            required: ["original", "translated"]
          }
        }
      });
      return res.text ? JSON.parse(res.text) : null;
    });
  }

  async extractAndAnalyzeText(base64: string, language: Language): Promise<OCRAnalysis | null> {
    const ai = this.getAi();
    const cleanBase64 = base64.replace(/^data:image\/(png|jpeg|jpg|webp|heic|heif);base64,/i, "");
    const targetLang = LANGUAGE_MAP[language];
    const prompt = `OCR and analyze text. Explain in ${targetLang}. JSON: extractedText, detectedLanguage, summary (in ${targetLang}), vocabulary (meanings in ${targetLang}), grammarPoints (explanations in ${targetLang}).`;
    return this.retryOperation(async () => {
      const res = await ai.models.generateContent({
        model: 'gemini-2.5-flash',
        contents: { parts: [{ inlineData: { mimeType: 'image/jpeg', data: cleanBase64 } }, { text: prompt }] },
        config: {
          responseMimeType: "application/json",
          responseSchema: {
            type: Type.OBJECT,
            properties: {
              extractedText: { type: Type.STRING },
              detectedLanguage: { type: Type.STRING },
              summary: { type: Type.STRING },
              vocabulary: { type: Type.ARRAY, items: { type: Type.OBJECT, properties: { word: { type: Type.STRING }, reading: { type: Type.STRING }, meaning: { type: Type.STRING } } } },
              grammarPoints: { type: Type.ARRAY, items: { type: Type.OBJECT, properties: { point: { type: Type.STRING }, explanation: { type: Type.STRING } } } }
            },
            required: ["extractedText", "detectedLanguage", "summary", "vocabulary", "grammarPoints"]
          }
        }
      });
      return res.text ? JSON.parse(res.text) : null;
    });
  }
}

export const geminiService = new GeminiService();
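// Configuration sketch (illustrative): pointing the service at a user-supplied
// key and proxy before use. getAi() reads localStorage on every call, so no
// re-initialization is needed after updating these values:
//
//   localStorage.setItem(USER_API_KEY_STORAGE, 'your-api-key');
//   localStorage.setItem(USER_BASE_URL_STORAGE, 'https://your-proxy.example.com');
//   const { text } = await geminiService.generateTextResponse("こんにちは！");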