// ai-app-skr/services/geminiService.ts
import { GoogleGenAI, Modality, Type } from "@google/genai";
import { PronunciationFeedback, Language, ReadingLesson, ReadingDifficulty, OCRAnalysis, ListeningLesson } from "../types";
import { base64ToUint8Array, uint8ArrayToBase64 } from "../utils/audioUtils";
export const USER_API_KEY_STORAGE = 'sakura_user_api_key';
export const USER_BASE_URL_STORAGE = 'sakura_user_base_url';
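
// A settings screen might persist user overrides like this (sketch; the input
// values are hypothetical, the storage keys are the constants above):
//
//   localStorage.setItem(USER_API_KEY_STORAGE, apiKeyInput.trim());
//   localStorage.setItem(USER_BASE_URL_STORAGE, baseUrlInput.trim());
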
// Helper to decode audio for playback.
// Supports raw PCM (typically returned by Gemini TTS), which browsers cannot
// decode natively via decodeAudioData.
export const decodeAudioData = async (
  base64Data: string,
  audioContext: AudioContext
): Promise<AudioBuffer> => {
  const binaryString = atob(base64Data);
  const len = binaryString.length;
  const bytes = new Uint8Array(len);
  for (let i = 0; i < len; i++) {
    bytes[i] = binaryString.charCodeAt(i);
  }
  try {
    // Try standard decoding first (WAV/MP3 containers).
    // We clone the buffer because decodeAudioData detaches it.
    return await audioContext.decodeAudioData(bytes.buffer.slice(0));
  } catch (e) {
    // Fallback: treat as raw PCM (16-bit mono little-endian, 24 kHz default
    // for Gemini TTS output; some models use 16 kHz).
    // Truncate to an even byte count so the Int16Array view cannot throw.
    const pcmData = new Int16Array(bytes.buffer, 0, Math.floor(len / 2));
    const float32Data = new Float32Array(pcmData.length);
    for (let i = 0; i < pcmData.length; i++) {
      // Convert int16 to float32 (-1.0 to 1.0)
      float32Data[i] = pcmData[i] / 32768.0;
    }
    // Create buffer: 1 channel, 24000 Hz sample rate
    const audioBuffer = audioContext.createBuffer(1, float32Data.length, 24000);
    audioBuffer.getChannelData(0).set(float32Data);
    return audioBuffer;
  }
};

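// Playback sketch (hypothetical caller; `base64Audio` would come from
// generateSpeech below):
//
//   const ctx = new AudioContext();
//   const buffer = await decodeAudioData(base64Audio, ctx);
//   const source = ctx.createBufferSource();
//   source.buffer = buffer;
//   source.connect(ctx.destination);
//   source.start();
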
// Helper to check/request Veo key
export const ensureVeoKey = async (): Promise<void> => {
  // @ts-ignore
  if (window.aistudio) {
    // @ts-ignore
    const hasKey = await window.aistudio.hasSelectedApiKey();
    if (!hasKey) {
      // @ts-ignore
      await window.aistudio.openSelectKey();
    }
  }
};

const LANGUAGE_MAP = {
  en: "English",
  ja: "Japanese",
  zh: "Chinese (Simplified)"
};

class GeminiService {
  private getAi() {
    const userKey = localStorage.getItem(USER_API_KEY_STORAGE);
    const userBaseUrl = localStorage.getItem(USER_BASE_URL_STORAGE);
    const envKey = process.env.API_KEY;
    const keyToUse = (userKey && userKey.trim().length > 0) ? userKey : envKey;
    if (!keyToUse) {
      console.error("API_KEY is missing.");
      throw new Error("API Key is missing");
    }
    const config: any = { apiKey: keyToUse };
    if (userBaseUrl && userBaseUrl.trim().length > 0) {
      // Sanitize Base URL: remove quotes and trailing slashes
      const cleanUrl = userBaseUrl.trim().replace(/['"]/g, '').replace(/\/+$/, '');
      config.baseUrl = cleanUrl;
    }
    return new GoogleGenAI(config);
  }

  private async getApiKey(): Promise<string> {
    const userKey = localStorage.getItem(USER_API_KEY_STORAGE);
    const envKey = process.env.API_KEY;
    const key = (userKey && userKey.trim().length > 0) ? userKey : envKey;
    if (!key) throw new Error("No API Key available");
    return key;
  }

  private async retryOperation<T>(operation: () => Promise<T>, retries = 3, delay = 1000): Promise<T> {
    try {
      return await operation();
    } catch (error: any) {
      const errorMsg = error?.message || '';
      // Check for network/CORS/proxy errors specifically
      if (errorMsg.includes('Failed to fetch') || errorMsg.includes('NetworkError')) {
        console.error("Network Error Detected:", error);
        throw new Error("Network connection failed. Please check your Base URL (proxy) settings or internet connection.");
      }
      const isOverloaded =
        error?.status === 503 ||
        error?.response?.status === 503 ||
        errorMsg.includes('503') ||
        errorMsg.includes('overloaded');
      if (isOverloaded && retries > 0) {
        console.warn(`Model overloaded (503). Retrying in ${delay}ms...`);
        await new Promise(resolve => setTimeout(resolve, delay));
        return this.retryOperation(operation, retries - 1, delay * 2);
      }
      throw error;
    }
  }

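  // With the defaults (retries = 3, delay = 1000), a persistently overloaded
  // model is retried after roughly 1 s, 2 s, and 4 s before the error surfaces.
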
  // 1. Text Chat Response - Returns { text, model }
  async generateTextResponse(
    prompt: string,
    imageBase64?: string,
    useThinking: boolean = false,
    language: Language = 'en',
    modelOverride?: string,
    aiSpeakingLanguage: 'ja' | 'native' = 'native'
  ): Promise<{ text: string, model: string }> {
    const ai = this.getAi();
    // Ensure model name is clean
    let modelName = useThinking
      ? 'gemini-3-pro-preview'
      : (imageBase64 ? 'gemini-3-pro-preview' : (modelOverride || 'gemini-2.5-flash'));
    // Extra safety: strip quotes just in case
    modelName = modelName.replace(/['"]/g, '');
    const targetLangName = LANGUAGE_MAP[language];
    const parts: any[] = [];
    if (imageBase64) {
      parts.push({
        inlineData: {
          mimeType: 'image/jpeg',
          data: imageBase64
        }
      });
      parts.push({ text: `Analyze this image in the context of learning Japanese. Explain in ${targetLangName}: ` + prompt });
    } else {
      parts.push({ text: prompt });
    }
    let instruction = "";
    if (aiSpeakingLanguage === 'ja') {
      instruction = `You are Sakura, a Japanese language tutor.
IMPORTANT:
- Respond primarily in Japanese (日本語) to help the user practice immersion.
- Only use ${targetLangName} for complex grammar explanations or if the user asks specifically for a translation.
- Keep the tone encouraging and natural.`;
    } else {
      instruction = `You are Sakura, a friendly, encouraging, and highly skilled Japanese language tutor. You help users learn vocabulary, grammar, listening, and speaking. You provide clear explanations, examples, and translations.
IMPORTANT:
- You are teaching Japanese.
- However, the user speaks ${targetLangName}.
- Provide your explanations, translations, and feedback in ${targetLangName}.`;
    }
    const config: any = {
      systemInstruction: instruction,
    };
    if (useThinking) {
      config.thinkingConfig = { thinkingBudget: 32768 };
    }
    return this.retryOperation(async () => {
      const response = await ai.models.generateContent({
        model: modelName,
        contents: { parts },
        config: config
      });
      return {
        text: response.text || "I apologize, I couldn't generate a response.",
        model: modelName
      };
    });
  }

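  // Usage sketch (hypothetical chat handler; arguments follow the signature above):
  //
  //   const { text, model } = await geminiService.generateTextResponse(
  //     'Explain は vs が', undefined, false, 'en'
  //   );
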
  // Internal helper for a single TTS chunk
  private async _generateSpeechChunk(text: string): Promise<string | null> {
    const ai = this.getAi();
    return this.retryOperation(async () => {
      try {
        const response = await ai.models.generateContent({
          model: 'gemini-2.5-flash-preview-tts',
          contents: [{ parts: [{ text }] }],
          config: {
            responseModalities: [Modality.AUDIO],
            speechConfig: {
              voiceConfig: { prebuiltVoiceConfig: { voiceName: 'Kore' } },
            },
          },
        });
        return response.candidates?.[0]?.content?.parts?.[0]?.inlineData?.data || null;
      } catch (e) {
        console.error("TTS Chunk Error", e);
        throw e; // Rethrow so retryOperation can handle network errors
      }
    });
  }

  async generateSpeech(text: string): Promise<string | null> {
    if (!text || !text.trim()) return null;
    const MAX_CHUNK_LENGTH = 250; // Safe limit to prevent network timeouts on long generations
    // If the text is short, process it directly
    if (text.length <= MAX_CHUNK_LENGTH) {
      try {
        return await this._generateSpeechChunk(text);
      } catch (e) {
        return null;
      }
    }
    // Split the text into chunks by sentence to avoid breaking words
    const regex = /[^。!?.!?\n]+[。!?.!?\n]*|[^。!?.!?\n]+$/g;
    const sentences = text.match(regex) || [text];
    const chunks: string[] = [];
    let currentChunk = '';
    for (const sentence of sentences) {
      if ((currentChunk + sentence).length > MAX_CHUNK_LENGTH) {
        if (currentChunk) chunks.push(currentChunk);
        currentChunk = sentence;
        // Force-split if a single sentence exceeds the max length
        while (currentChunk.length > MAX_CHUNK_LENGTH) {
          chunks.push(currentChunk.slice(0, MAX_CHUNK_LENGTH));
          currentChunk = currentChunk.slice(MAX_CHUNK_LENGTH);
        }
      } else {
        currentChunk += sentence;
      }
    }
    if (currentChunk) chunks.push(currentChunk);
    try {
      // Generate chunks in parallel to speed up total time
      // Note: Promise.all order is preserved
      const results = await Promise.all(chunks.map(chunk => this._generateSpeechChunk(chunk)));
      // If any chunk failed, the whole audio is compromised
      if (results.some(r => r === null)) return null;
      // Convert Base64 -> Uint8Array
      const audioSegments = results.map(r => base64ToUint8Array(r!));
      // Concatenate raw PCM data (headerless, so segments can be joined directly)
      const totalLength = audioSegments.reduce((acc, cur) => acc + cur.length, 0);
      const combined = new Uint8Array(totalLength);
      let offset = 0;
      for (const seg of audioSegments) {
        combined.set(seg, offset);
        offset += seg.length;
      }
      // Convert back to Base64 for playback/storage
      return uint8ArrayToBase64(combined);
    } catch (e) {
      console.error("TTS Assembly Error", e);
      return null;
    }
  }

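  // Worked example: a 600-character passage splits at sentence boundaries into
  // roughly three chunks of at most 250 characters each, which are synthesized
  // in parallel and concatenated back into a single raw PCM stream.
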
  async transcribeAudio(audioBase64: string): Promise<string> {
    const ai = this.getAi();
    return this.retryOperation(async () => {
      const response = await ai.models.generateContent({
        model: 'gemini-2.5-flash',
        contents: {
          parts: [
            { inlineData: { mimeType: 'audio/wav', data: audioBase64 } },
            { text: "Transcribe accurately." },
          ],
        },
      });
      return response.text || "";
    });
  }

  async generateImage(prompt: string): Promise<string | null> {
    const ai = this.getAi();
    return this.retryOperation(async () => {
      try {
        const response = await ai.models.generateImages({
          model: 'imagen-4.0-generate-001',
          prompt: prompt + ", in the style of a Japanese textbook illustration",
          config: { numberOfImages: 1, outputMimeType: 'image/jpeg', aspectRatio: '1:1' },
        });
        const bytes = response.generatedImages?.[0]?.image?.imageBytes;
        return bytes ? `data:image/jpeg;base64,${bytes}` : null;
      } catch (e) {
        console.error("Image Gen Error", e);
        throw e;
      }
    });
  }

  async editImage(base64Original: string, prompt: string): Promise<string | null> {
    const ai = this.getAi();
    return this.retryOperation(async () => {
      try {
        const cleanBase64 = base64Original.replace(/^data:image\/(png|jpeg|jpg|webp|heic|heif);base64,/i, "");
        const response = await ai.models.generateContent({
          model: 'gemini-2.5-flash-image',
          contents: {
            parts: [
              { inlineData: { data: cleanBase64, mimeType: 'image/jpeg' } },
              { text: prompt }
            ]
          },
          config: { responseModalities: [Modality.IMAGE] }
        });
        for (const part of response.candidates?.[0]?.content?.parts || []) {
          if (part.inlineData) return `data:image/png;base64,${part.inlineData.data}`;
        }
        return null;
      } catch (e) {
        console.error("Image Edit Error", e);
        throw e;
      }
    });
  }

  async generateVideo(prompt: string, onStatusUpdate: (status: string) => void): Promise<string | null> {
    await ensureVeoKey();
    const ai = this.getAi();
    try {
      onStatusUpdate("Initializing Veo...");
      let operation = await ai.models.generateVideos({
        model: 'veo-3.1-fast-generate-preview',
        prompt: prompt,
        config: { numberOfVideos: 1, resolution: '720p', aspectRatio: '16:9' }
      });
      onStatusUpdate("Dreaming up video...");
      // Poll the long-running operation until it completes
      while (!operation.done) {
        await new Promise(resolve => setTimeout(resolve, 5000));
        operation = await ai.operations.getVideosOperation({ operation: operation });
      }
      const videoUri = operation.response?.generatedVideos?.[0]?.video?.uri;
      if (!videoUri) return null;
      // The download URI requires the API key appended as a query parameter
      const apiKey = await this.getApiKey();
      const videoRes = await fetch(`${videoUri}&key=${apiKey}`);
      const blob = await videoRes.blob();
      return URL.createObjectURL(blob);
    } catch (e) {
      console.error("Veo Error", e);
      return null;
    }
  }

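  // Usage sketch (hypothetical caller; setStatusMessage is an assumed UI setter):
  //
  //   const url = await geminiService.generateVideo(
  //     'A cherry blossom festival in Kyoto',
  //     (status) => setStatusMessage(status)
  //   );
  //   if (url) videoEl.src = url; // revoke with URL.revokeObjectURL(url) when done
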
  async analyzeSpeakingPerformance(audioBase64: string, scenarioContext: string, historyContext: string, language: Language = 'en'): Promise<PronunciationFeedback | null> {
    const ai = this.getAi();
    const targetLangName = LANGUAGE_MAP[language];
    const prompt = `Roleplay: ${scenarioContext}. History: ${historyContext}. Listen, Transcribe, Reply, Evaluate (JSON). Translation/Advice in ${targetLangName}.`;
    return this.retryOperation(async () => {
      const response = await ai.models.generateContent({
        model: 'gemini-2.5-flash',
        contents: {
          parts: [{ inlineData: { mimeType: 'audio/wav', data: audioBase64 } }, { text: prompt }]
        },
        config: {
          responseMimeType: "application/json",
          responseSchema: {
            type: Type.OBJECT,
            properties: {
              transcription: { type: Type.STRING },
              response: { type: Type.STRING },
              translation: { type: Type.STRING },
              score: { type: Type.INTEGER },
              pronunciationIssues: { type: Type.ARRAY, items: { type: Type.STRING } },
              advice: { type: Type.STRING }
            },
            required: ["transcription", "response", "translation", "score", "pronunciationIssues", "advice"]
          }
        }
      });
      return response.text ? JSON.parse(response.text) : null;
    });
  }

  async generateReadingLesson(topic: string, difficulty: ReadingDifficulty, language: Language): Promise<ReadingLesson | null> {
    const ai = this.getAi();
    const targetLangName = LANGUAGE_MAP[language];
    const prompt = `Create a complete Japanese reading lesson on "${topic}", level ${difficulty}.
The 'japaneseContent' MUST be a complete article or story (at least 300 characters).
Output JSON with title, japaneseContent, translation (${targetLangName}), vocabulary, and grammarPoints (list of key grammar used in the text with explanations).`;
    return this.retryOperation(async () => {
      const response = await ai.models.generateContent({
        model: 'gemini-2.5-flash',
        contents: { parts: [{ text: prompt }] },
        config: {
          responseMimeType: "application/json",
          responseSchema: {
            type: Type.OBJECT,
            properties: {
              title: { type: Type.STRING },
              japaneseContent: { type: Type.STRING },
              translation: { type: Type.STRING },
              vocabulary: { type: Type.ARRAY, items: { type: Type.OBJECT, properties: { word: { type: Type.STRING }, reading: { type: Type.STRING }, meaning: { type: Type.STRING } } } },
              grammarPoints: { type: Type.ARRAY, items: { type: Type.OBJECT, properties: { point: { type: Type.STRING }, explanation: { type: Type.STRING } } } }
            },
            required: ["title", "japaneseContent", "translation", "vocabulary", "grammarPoints"]
          }
        }
      });
      return response.text ? JSON.parse(response.text) : null;
    });
  }

  async generateListeningLesson(topic: string, difficulty: ReadingDifficulty, language: Language): Promise<ListeningLesson | null> {
    const ai = this.getAi();
    const targetLangName = LANGUAGE_MAP[language];
    // Prompt asks for a conversation or monologue suitable for listening practice
    const prompt = `Create a Japanese listening practice script on "${topic}", level ${difficulty}. It should be a conversation or monologue.
Output JSON with:
- title
- script (The full Japanese text of the conversation/monologue)
- translation (The full text in ${targetLangName})
- vocabulary (Key words)
- questions (3 multiple choice comprehension questions in ${targetLangName})
- Each question needs: question, options (array of 3 strings), correctIndex (0-2), explanation.
`;
    return this.retryOperation(async () => {
      const response = await ai.models.generateContent({
        model: 'gemini-2.5-flash',
        contents: { parts: [{ text: prompt }] },
        config: {
          responseMimeType: "application/json",
          responseSchema: {
            type: Type.OBJECT,
            properties: {
              title: { type: Type.STRING },
              script: { type: Type.STRING },
              translation: { type: Type.STRING },
              vocabulary: { type: Type.ARRAY, items: { type: Type.OBJECT, properties: { word: { type: Type.STRING }, reading: { type: Type.STRING }, meaning: { type: Type.STRING } } } },
              questions: {
                type: Type.ARRAY,
                items: {
                  type: Type.OBJECT,
                  properties: {
                    id: { type: Type.STRING },
                    question: { type: Type.STRING },
                    options: { type: Type.ARRAY, items: { type: Type.STRING } },
                    correctIndex: { type: Type.INTEGER },
                    explanation: { type: Type.STRING }
                  },
                  required: ["question", "options", "correctIndex", "explanation"]
                }
              }
            },
            required: ["title", "script", "translation", "vocabulary", "questions"]
          }
        }
      });
      return response.text ? JSON.parse(response.text) : null;
    });
  }

  async generateReadingTutorResponse(question: string, lesson: ReadingLesson | ListeningLesson, history: string, language: Language): Promise<string> {
    const ai = this.getAi();
    // Handle both ReadingLesson (japaneseContent) and ListeningLesson (script)
    const content = 'japaneseContent' in lesson ? lesson.japaneseContent : lesson.script;
    // Include the lesson text itself so the tutor can actually reference it
    const prompt = `Tutor for text "${lesson.title}". Text: ${content}. Question: "${question}". History: ${history}. Explain in ${LANGUAGE_MAP[language]}.`;
    return this.retryOperation(async () => {
      const res = await ai.models.generateContent({
        model: 'gemini-2.5-flash',
        contents: { parts: [{ text: prompt }] }
      });
      return res.text || "";
    });
  }

  async translateText(text: string, target: string, source: string = "Auto"): Promise<string> {
    const ai = this.getAi();
    return this.retryOperation(async () => {
      const res = await ai.models.generateContent({
        model: 'gemini-2.5-flash',
        contents: { parts: [{ text: `Translate the following text from ${source} to ${target}.` }, { text: text }] },
        config: {
          responseMimeType: "application/json",
          responseSchema: {
            type: Type.OBJECT,
            properties: { translation: { type: Type.STRING } },
            required: ["translation"]
          }
        }
      });
      return (res.text ? JSON.parse(res.text).translation : "") || "";
    });
  }

  async translateImage(base64: string, target: string, source: string = "Auto"): Promise<{ original: string; translated: string } | null> {
    const ai = this.getAi();
    const cleanBase64 = base64.replace(/^data:image\/(png|jpeg|jpg|webp|heic|heif);base64,/i, "");
    return this.retryOperation(async () => {
      const res = await ai.models.generateContent({
        model: 'gemini-2.5-flash',
        contents: {
          parts: [{ inlineData: { mimeType: 'image/jpeg', data: cleanBase64 } }, { text: `Extract text (Language: ${source}) and translate to ${target}. JSON output: original, translated.` }]
        },
        config: {
          responseMimeType: "application/json",
          responseSchema: {
            type: Type.OBJECT,
            properties: { original: { type: Type.STRING }, translated: { type: Type.STRING } },
            required: ["original", "translated"]
          }
        }
      });
      return res.text ? JSON.parse(res.text) : null;
    });
  }

  async extractAndAnalyzeText(base64: string, language: Language): Promise<OCRAnalysis | null> {
    const ai = this.getAi();
    const cleanBase64 = base64.replace(/^data:image\/(png|jpeg|jpg|webp|heic|heif);base64,/i, "");
    const targetLang = LANGUAGE_MAP[language];
    const prompt = `OCR and analyze text. Explain in ${targetLang}. JSON: extractedText, detectedLanguage, summary, vocabulary, grammarPoints.`;
    return this.retryOperation(async () => {
      const res = await ai.models.generateContent({
        model: 'gemini-2.5-flash',
        contents: {
          parts: [{ inlineData: { mimeType: 'image/jpeg', data: cleanBase64 } }, { text: prompt }]
        },
        config: {
          responseMimeType: "application/json",
          responseSchema: {
            type: Type.OBJECT,
            properties: {
              extractedText: { type: Type.STRING },
              detectedLanguage: { type: Type.STRING },
              summary: { type: Type.STRING },
              vocabulary: { type: Type.ARRAY, items: { type: Type.OBJECT, properties: { word: { type: Type.STRING }, reading: { type: Type.STRING }, meaning: { type: Type.STRING } } } },
              grammarPoints: { type: Type.ARRAY, items: { type: Type.OBJECT, properties: { point: { type: Type.STRING }, explanation: { type: Type.STRING } } } }
            },
            required: ["extractedText", "detectedLanguage", "summary", "vocabulary", "grammarPoints"]
          }
        }
      });
      return res.text ? JSON.parse(res.text) : null;
    });
  }
}
export const geminiService = new GeminiService();
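
// Usage sketch (hypothetical consumer; import path depends on the caller's location):
//
//   import { geminiService } from './services/geminiService';
//   const translated = await geminiService.translateText('こんにちは', 'English');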