// File: Canto/qwen3-tts-frontend/src/components/tts/VoiceCloneForm.tsx
// Repository listing metadata: 434 lines, 16 KiB, TypeScript

import { useForm, Controller } from 'react-hook-form'
import { zodResolver } from '@hookform/resolvers/zod'
import * as z from 'zod'
import { useEffect, useState, useMemo } from 'react'
import { useTranslation } from 'react-i18next'
import { Button } from '@/components/ui/button'
import { Input } from '@/components/ui/input'
import { Textarea } from '@/components/ui/textarea'
import { Select, SelectContent, SelectItem, SelectTrigger, SelectValue } from '@/components/ui/select'
import { Dialog, DialogContent, DialogDescription, DialogHeader, DialogTitle, DialogTrigger, DialogFooter } from '@/components/ui/dialog'
import { Checkbox } from '@/components/ui/checkbox'
import { Label } from '@/components/ui/label'
import { Settings, Globe2, Type, Play, FileText, Mic, ArrowRight, ArrowLeft } from 'lucide-react'
import { toast } from 'sonner'
import { IconLabel } from '@/components/IconLabel'
import { Tooltip, TooltipContent, TooltipProvider, TooltipTrigger } from '@/components/ui/tooltip'
import { ttsApi, jobApi } from '@/lib/api'
import { useJobPolling } from '@/hooks/useJobPolling'
import { useHistoryContext } from '@/contexts/HistoryContext'
import { LoadingState } from '@/components/LoadingState'
import { AudioPlayer } from '@/components/AudioPlayer'
import { FileUploader } from '@/components/FileUploader'
import { AudioRecorder } from '@/components/AudioRecorder'
import { PresetSelector } from '@/components/PresetSelector'
import type { Language } from '@/types/tts'
import { Tabs, TabsContent, TabsList, TabsTrigger } from '@/components/ui/tabs'
/**
 * Values managed by the voice-clone form.
 *
 * Only `text` and `ref_audio` are required; every other field is optional
 * and is range-checked by the zod schema built inside the component.
 */
type FormData = {
  // --- Reference voice (step 1) ---

  /** Reference audio file, supplied by the uploader or the recorder. */
  ref_audio: File
  /** Optional text associated with the reference audio. */
  ref_text?: string

  // --- Synthesis input (step 2) ---

  /** Text entered in step 2; the schema requires 1 to 1000 characters. */
  text: string
  /** Optional language code chosen in the language select. */
  language?: string

  // --- Toggles (step 2 checkboxes) ---

  /** Bound to the "use cache" checkbox. */
  use_cache?: boolean
  /** Bound to the "fast mode" checkbox. */
  x_vector_only_mode?: boolean

  // --- Generation parameters (presumably forwarded to the backend as-is) ---

  /** Schema range: 128 to 4096. */
  max_new_tokens?: number
  /** Schema range: 0.1 to 2. */
  temperature?: number
  /** Schema range: 1 to 100. */
  top_k?: number
  /** Schema range: 0 to 1. */
  top_p?: number
  /** Schema range: 1 to 2. */
  repetition_penalty?: number
}
/** Inclusive lower bound accepted for `max_new_tokens`. */
const MAX_NEW_TOKENS_MIN = 128
/** Inclusive upper bound accepted for `max_new_tokens`. */
const MAX_NEW_TOKENS_MAX = 4096
/** Value used for `max_new_tokens` when nothing usable has been entered. */
const MAX_NEW_TOKENS_DEFAULT = 2048

/** One entry of the `presetRefTexts` translation resource. */
type PresetRefText = { label: string; text: string }

/**
 * Forces a `max_new_tokens` candidate into the range the form schema accepts.
 *
 * @param value - Raw number taken from the advanced-options dialog.
 * @returns The default for non-finite input, otherwise the truncated value
 *   clamped to `[MAX_NEW_TOKENS_MIN, MAX_NEW_TOKENS_MAX]`.
 */
function clampMaxNewTokens(value: number): number {
  if (!Number.isFinite(value)) return MAX_NEW_TOKENS_DEFAULT
  return Math.min(MAX_NEW_TOKENS_MAX, Math.max(MAX_NEW_TOKENS_MIN, Math.trunc(value)))
}

/** Narrows an arbitrary tab value to the two input tabs this form renders. */
function isInputTab(value: string): value is 'upload' | 'record' {
  return value === 'upload' || value === 'record'
}

/**
 * Two-step voice-clone form.
 *
 * Step 1 collects the reference audio (uploaded or recorded) and an optional
 * reference text. Step 2 collects the language, the text to generate, two
 * toggles and the advanced `max_new_tokens` option. Submitting creates a job
 * through `ttsApi.createVoiceCloneJob`, starts polling it and refreshes the
 * history context; a loading indicator is shown while polling and an audio
 * player once the job has completed.
 */
function VoiceCloneForm() {
  const { t } = useTranslation('tts')
  const { t: tCommon } = useTranslation('common')
  const { t: tVoice } = useTranslation('voice')
  const { t: tErrors } = useTranslation('errors')
  const { t: tConstants } = useTranslation('constants')

  // With `returnObjects: true` the translation call yields the resource value
  // itself. If the key is missing that is not an array, so guard instead of
  // asserting the type: calling `.map` on a non-array would crash the render.
  const PRESET_REF_TEXTS = useMemo<PresetRefText[]>(() => {
    const raw: unknown = tConstants('presetRefTexts', { returnObjects: true })
    return Array.isArray(raw) ? (raw as PresetRefText[]) : []
  }, [tConstants])

  // Rebuilt only when the error translator changes, so the validation
  // messages follow the active language without a new schema on every render.
  const formSchema = useMemo(
    () =>
      z.object({
        text: z
          .string()
          .min(1, tErrors('validation.required', { field: tErrors('fieldNames.text') }))
          .max(1000, tErrors('validation.maxLength', { field: tErrors('fieldNames.text'), max: 1000 })),
        language: z.string().optional(),
        ref_audio: z.instanceof(File, {
          message: tErrors('validation.required', { field: tErrors('fieldNames.reference_audio') }),
        }),
        ref_text: z.string().optional(),
        use_cache: z.boolean().optional(),
        x_vector_only_mode: z.boolean().optional(),
        max_new_tokens: z.number().min(MAX_NEW_TOKENS_MIN).max(MAX_NEW_TOKENS_MAX).optional(),
        temperature: z.number().min(0.1).max(2).optional(),
        top_k: z.number().min(1).max(100).optional(),
        top_p: z.number().min(0).max(1).optional(),
        repetition_penalty: z.number().min(1).max(2).optional(),
      }),
    [tErrors],
  )

  const [languages, setLanguages] = useState<Language[]>([])
  const [isLoading, setIsLoading] = useState(false)
  const [advancedOpen, setAdvancedOpen] = useState(false)
  const [step, setStep] = useState<1 | 2>(1)
  const [inputTab, setInputTab] = useState<'upload' | 'record'>('upload')
  // Draft values edited inside the advanced dialog; copied into the form only
  // when the user confirms.
  const [tempAdvancedParams, setTempAdvancedParams] = useState({
    max_new_tokens: MAX_NEW_TOKENS_DEFAULT,
  })

  const { currentJob, isPolling, isCompleted, startPolling, elapsedTime } = useJobPolling()
  const { refresh } = useHistoryContext()

  const {
    register,
    handleSubmit,
    setValue,
    watch,
    control,
    trigger,
    formState: { errors },
  } = useForm<FormData>({
    resolver: zodResolver(formSchema),
    defaultValues: {
      text: '',
      language: 'Auto',
      ref_text: '',
      use_cache: true,
      x_vector_only_mode: false,
      max_new_tokens: MAX_NEW_TOKENS_DEFAULT,
      temperature: 0.9,
      top_k: 50,
      top_p: 1.0,
      repetition_penalty: 1.05,
    } as Partial<FormData>,
  })

  // Subscribe once per render instead of calling `watch` inside JSX loops.
  const refAudio = watch('ref_audio')
  const refText = watch('ref_text')
  const selectedLanguage = watch('language')
  const fastModeChecked = watch('x_vector_only_mode')
  const useCacheChecked = watch('use_cache')

  // Load the selectable languages. The flag keeps a response that arrives
  // after unmount (or after a newer run of this effect) from touching state.
  useEffect(() => {
    let cancelled = false
    const fetchData = async () => {
      try {
        const langs = await ttsApi.getLanguages()
        if (!cancelled) setLanguages(langs)
      } catch {
        if (!cancelled) toast.error(t('loadDataFailed'))
      }
    }
    void fetchData()
    return () => {
      cancelled = true
    }
  }, [t])

  // Switching tabs resets the reference text: the record tab starts on the
  // first preset, the upload tab starts empty.
  useEffect(() => {
    if (inputTab === 'record' && PRESET_REF_TEXTS.length > 0) {
      setValue('ref_text', PRESET_REF_TEXTS[0].text)
    } else if (inputTab === 'upload') {
      setValue('ref_text', '')
    }
    // PRESET_REF_TEXTS is deliberately not a dependency: re-running this
    // effect when the presets change would discard text the user already typed.
    // eslint-disable-next-line react-hooks/exhaustive-deps
  }, [inputTab, setValue])

  /** Advances to step 2 only when the step-1 fields pass validation. */
  const handleNextStep = async () => {
    const valid = await trigger(['ref_audio', 'ref_text'])
    if (valid) {
      setStep(2)
    }
  }

  /** Creates the voice-clone job, starts polling it and refreshes the history. */
  const onSubmit = async (data: FormData) => {
    setIsLoading(true)
    try {
      const result = await ttsApi.createVoiceCloneJob({
        ...data,
        ref_audio: data.ref_audio,
      })
      toast.success(t('taskCreated'))
      startPolling(result.job_id)
      try {
        await refresh()
      } catch (refreshError: unknown) {
        // Best effort: the job was already created, so a failed history
        // refresh must not be reported as a failed submission.
        console.warn('History refresh failed after creating a voice clone job', refreshError)
      }
    } catch {
      toast.error(t('taskCreateFailed'))
    } finally {
      setIsLoading(false)
    }
  }

  const memoizedAudioUrl = useMemo(() => {
    if (!currentJob) return ''
    return jobApi.getAudioUrl(currentJob.id, currentJob.audio_url)
  }, [currentJob?.id, currentJob?.audio_url])

  return (
    <form onSubmit={handleSubmit(onSubmit)} className="space-y-4">
      {/* Steps Indicator */}
      <div className="flex items-center justify-center space-x-4 mb-6">
        <div className={`flex items-center space-x-2 ${step === 1 ? 'text-primary' : 'text-muted-foreground'}`}>
          <div className={`w-8 h-8 rounded-full flex items-center justify-center border-2 ${step === 1 ? 'border-primary bg-primary/10' : 'border-muted'}`}>1</div>
          <span className="text-sm font-medium">{tVoice('step1Title')}</span>
        </div>
        <div className="w-8 h-[2px] bg-muted" />
        <div className={`flex items-center space-x-2 ${step === 2 ? 'text-primary' : 'text-muted-foreground'}`}>
          <div className={`w-8 h-8 rounded-full flex items-center justify-center border-2 ${step === 2 ? 'border-primary bg-primary/10' : 'border-muted'}`}>2</div>
          <span className="text-sm font-medium">{tVoice('step2Title')}</span>
        </div>
      </div>

      <div className={step === 1 ? 'block' : 'hidden'}>
        {/* Step 1: Input Selection */}
        <Tabs
          value={inputTab}
          onValueChange={(v) => {
            // Ignore anything that is not one of the two rendered tabs.
            if (isInputTab(v)) setInputTab(v)
          }}
          className="w-full"
        >
          <TabsList className="grid w-full grid-cols-2">
            <TabsTrigger value="upload" className="flex items-center gap-2">
              <FileText className="h-4 w-4" />
              {tVoice('uploadTab')}
            </TabsTrigger>
            <TabsTrigger value="record" className="flex items-center gap-2">
              <Mic className="h-4 w-4" />
              {tVoice('recordTab')}
            </TabsTrigger>
          </TabsList>

          <TabsContent value="upload" className="space-y-4 mt-4">
            <div className="space-y-0.5">
              <Label>{tVoice('refAudioLabel')}</Label>
              <Controller
                name="ref_audio"
                control={control}
                render={({ field }) => (
                  <FileUploader
                    value={field.value}
                    onChange={field.onChange}
                    error={errors.ref_audio?.message}
                  />
                )}
              />
            </div>
            <div className="space-y-0.5">
              <Label>{tVoice('refTextLabel')}</Label>
              <Textarea
                {...register('ref_text')}
                placeholder={tVoice('refTextPlaceholder')}
                className="min-h-[100px]"
              />
              <PresetSelector
                presets={PRESET_REF_TEXTS}
                onSelect={(preset) => setValue('ref_text', preset.text)}
              />
            </div>
            <Button type="button" className="w-full mt-6" onClick={handleNextStep}>
              {tVoice('nextStep')}
              <ArrowRight className="ml-2 h-4 w-4" />
            </Button>
          </TabsContent>

          <TabsContent value="record" className="space-y-4 mt-4">
            <div className="space-y-2">
              <Label className="text-base font-medium">{tVoice('readPrompt')}</Label>
              <div className="grid grid-cols-3 gap-2">
                {PRESET_REF_TEXTS.map((preset, i) => {
                  const isSelected = refText === preset.text
                  const selectPreset = () => setValue('ref_text', preset.text)
                  return (
                    <div
                      key={i}
                      role="button"
                      tabIndex={0}
                      aria-pressed={isSelected}
                      className={`p-3 border rounded-lg hover:bg-accent cursor-pointer transition-colors text-sm text-center ${
                        isSelected ? 'border-primary bg-primary/10' : ''
                      }`}
                      onClick={selectPreset}
                      onKeyDown={(e) => {
                        // Make the tile usable from the keyboard like a button.
                        if (e.key === 'Enter' || e.key === ' ') {
                          e.preventDefault()
                          selectPreset()
                        }
                      }}
                    >
                      <div className="font-medium">{preset.label}</div>
                    </div>
                  )
                })}
              </div>
              <div className="space-y-0.5 pt-2">
                <Label>{tVoice('currentRefText')}</Label>
                <Textarea
                  {...register('ref_text')}
                  placeholder={tVoice('currentRefTextPlaceholder')}
                  className="min-h-[80px]"
                />
              </div>
            </div>

            {/* Mobile-friendly Bottom Recorder Area */}
            <div className="fixed bottom-0 left-0 right-0 p-4 bg-background border-t z-50 md:relative md:border-t-0 md:bg-transparent md:p-0 md:z-0">
              <div className="space-y-3">
                {refAudio && (
                  <Button type="button" className="w-full" onClick={handleNextStep}>
                    {tVoice('nextStep')}
                    <ArrowRight className="ml-2 h-4 w-4" />
                  </Button>
                )}
                <Controller
                  name="ref_audio"
                  control={control}
                  render={({ field }) => (
                    <AudioRecorder
                      onChange={field.onChange}
                    />
                  )}
                />
                {errors.ref_audio && (
                  <p className="text-sm text-destructive mt-2 text-center md:text-left">{errors.ref_audio.message}</p>
                )}
              </div>
            </div>

            {/* Spacer for mobile to prevent content being hidden behind fixed footer */}
            <div className="h-24 md:hidden" />
          </TabsContent>
        </Tabs>
      </div>

      <div className={step === 2 ? 'block space-y-4' : 'hidden'}>
        {/* Step 2: Synthesis Options */}
        <div className="space-y-0.5">
          <IconLabel icon={Globe2} tooltip={tVoice('languageOptional')} />
          <Select
            value={selectedLanguage}
            onValueChange={(value: string) => setValue('language', value)}
          >
            <SelectTrigger>
              <SelectValue />
            </SelectTrigger>
            <SelectContent>
              {languages.map((lang) => (
                <SelectItem key={lang.code} value={lang.code}>
                  {tConstants(`languages.${lang.code}`, { defaultValue: lang.name })}
                </SelectItem>
              ))}
            </SelectContent>
          </Select>
        </div>

        <div className="space-y-0.5">
          <IconLabel icon={Type} tooltip={t('textLabel')} required />
          <Textarea
            {...register('text')}
            placeholder={t('textPlaceholder')}
            className="min-h-[120px]"
          />
          <PresetSelector
            presets={PRESET_REF_TEXTS}
            onSelect={(preset) => setValue('text', preset.text)}
          />
          {errors.text && (
            <p className="text-sm text-destructive">{errors.text.message}</p>
          )}
        </div>

        <div className="flex flex-col sm:flex-row gap-4 pt-2">
          <div className="flex items-center space-x-2">
            <Checkbox
              id="x_vector_only_mode"
              checked={fastModeChecked}
              onCheckedChange={(c) => setValue('x_vector_only_mode', c as boolean)}
            />
            <Label htmlFor="x_vector_only_mode" className="text-sm font-normal cursor-pointer">
              {tVoice('fastMode')}
            </Label>
          </div>
          <div className="flex items-center space-x-2">
            <Checkbox
              id="use_cache"
              checked={useCacheChecked}
              onCheckedChange={(c) => setValue('use_cache', c as boolean)}
            />
            <Label htmlFor="use_cache" className="text-sm font-normal cursor-pointer">
              {tVoice('useCache')}
            </Label>
          </div>
        </div>

        <Dialog
          open={advancedOpen}
          onOpenChange={(open) => {
            if (open) {
              // Seed the draft from the committed form value on every open so
              // a cancelled edit leaves nothing behind.
              setTempAdvancedParams({
                max_new_tokens: watch('max_new_tokens') ?? MAX_NEW_TOKENS_DEFAULT,
              })
            }
            setAdvancedOpen(open)
          }}
        >
          <DialogTrigger asChild>
            <Button type="button" variant="outline" className="w-full">
              <Settings className="mr-2 h-4 w-4" />
              {t('advancedOptions')}
            </Button>
          </DialogTrigger>
          <DialogContent className="sm:max-w-[500px]">
            <DialogHeader>
              <DialogTitle>{t('advancedOptionsTitle')}</DialogTitle>
              <DialogDescription>{t('advancedOptionsDescription')}</DialogDescription>
            </DialogHeader>
            <div className="space-y-4 py-4">
              <div className="space-y-2">
                <Label htmlFor="dialog-max_new_tokens">
                  {t('advancedParams.maxNewTokens.label')}
                </Label>
                <Input
                  id="dialog-max_new_tokens"
                  type="number"
                  min={MAX_NEW_TOKENS_MIN}
                  max={MAX_NEW_TOKENS_MAX}
                  value={tempAdvancedParams.max_new_tokens}
                  onChange={(e) => {
                    const parsed = Number.parseInt(e.target.value, 10)
                    setTempAdvancedParams((prev) => ({
                      ...prev,
                      max_new_tokens: Number.isNaN(parsed) ? MAX_NEW_TOKENS_DEFAULT : parsed,
                    }))
                  }}
                />
                <p className="text-sm text-muted-foreground">
                  {t('advancedParams.maxNewTokens.description')}
                </p>
              </div>
            </div>
            <DialogFooter>
              <Button
                type="button"
                variant="outline"
                onClick={() => {
                  setAdvancedOpen(false)
                }}
              >
                {tCommon('cancel')}
              </Button>
              <Button
                type="button"
                onClick={() => {
                  // Clamp before committing: an out-of-range value would fail
                  // schema validation on submit, and no error is rendered for
                  // this field, so the submit button would appear to do nothing.
                  setValue('max_new_tokens', clampMaxNewTokens(tempAdvancedParams.max_new_tokens))
                  setAdvancedOpen(false)
                }}
              >
                {tCommon('ok')}
              </Button>
            </DialogFooter>
          </DialogContent>
        </Dialog>

        <div className="flex gap-3 pt-4">
          <Button type="button" variant="outline" onClick={() => setStep(1)} className="w-1/3">
            <ArrowLeft className="mr-2 h-4 w-4" />
            {tVoice('prevStep')}
          </Button>
          <TooltipProvider>
            <Tooltip>
              <TooltipTrigger asChild>
                <Button type="submit" className="flex-1" disabled={isLoading || isPolling}>
                  <Play className="mr-2 h-4 w-4" />
                  {isLoading ? t('creating') : t('generate')}
                </Button>
              </TooltipTrigger>
              <TooltipContent>
                <p>{t('generate')}</p>
              </TooltipContent>
            </Tooltip>
          </TooltipProvider>
        </div>
      </div>

      {isPolling && <LoadingState elapsedTime={elapsedTime} />}

      {isCompleted && currentJob && (
        <div className="space-y-4 pt-4 border-t">
          <AudioPlayer
            audioUrl={memoizedAudioUrl}
            jobId={currentJob.id}
            text={currentJob.parameters?.text}
          />
        </div>
      )}
    </form>
  )
}

export default VoiceCloneForm