434 lines
16 KiB
TypeScript
434 lines
16 KiB
TypeScript
import { useForm, Controller } from 'react-hook-form'
|
|
import { zodResolver } from '@hookform/resolvers/zod'
|
|
import * as z from 'zod'
|
|
import { useEffect, useState, useMemo } from 'react'
|
|
import { useTranslation } from 'react-i18next'
|
|
import { Button } from '@/components/ui/button'
|
|
import { Input } from '@/components/ui/input'
|
|
import { Textarea } from '@/components/ui/textarea'
|
|
import { Select, SelectContent, SelectItem, SelectTrigger, SelectValue } from '@/components/ui/select'
|
|
import { Dialog, DialogContent, DialogDescription, DialogHeader, DialogTitle, DialogTrigger, DialogFooter } from '@/components/ui/dialog'
|
|
import { Checkbox } from '@/components/ui/checkbox'
|
|
import { Label } from '@/components/ui/label'
|
|
import { Settings, Globe2, Type, Play, FileText, Mic, ArrowRight, ArrowLeft } from 'lucide-react'
|
|
import { toast } from 'sonner'
|
|
import { IconLabel } from '@/components/IconLabel'
|
|
import { Tooltip, TooltipContent, TooltipProvider, TooltipTrigger } from '@/components/ui/tooltip'
|
|
import { ttsApi, jobApi } from '@/lib/api'
|
|
import { useJobPolling } from '@/hooks/useJobPolling'
|
|
import { useHistoryContext } from '@/contexts/HistoryContext'
|
|
import { LoadingState } from '@/components/LoadingState'
|
|
import { AudioPlayer } from '@/components/AudioPlayer'
|
|
import { FileUploader } from '@/components/FileUploader'
|
|
import { AudioRecorder } from '@/components/AudioRecorder'
|
|
import { PresetSelector } from '@/components/PresetSelector'
|
|
import type { Language } from '@/types/tts'
|
|
import { Tabs, TabsContent, TabsList, TabsTrigger } from '@/components/ui/tabs'
|
|
|
|
type FormData = {
|
|
text: string
|
|
language?: string
|
|
ref_audio: File
|
|
ref_text?: string
|
|
use_cache?: boolean
|
|
x_vector_only_mode?: boolean
|
|
max_new_tokens?: number
|
|
temperature?: number
|
|
top_k?: number
|
|
top_p?: number
|
|
repetition_penalty?: number
|
|
}
|
|
|
|
function VoiceCloneForm() {
|
|
const { t } = useTranslation('tts')
|
|
const { t: tCommon } = useTranslation('common')
|
|
const { t: tVoice } = useTranslation('voice')
|
|
const { t: tErrors } = useTranslation('errors')
|
|
const { t: tConstants } = useTranslation('constants')
|
|
|
|
const PRESET_REF_TEXTS = useMemo(() => tConstants('presetRefTexts', { returnObjects: true }) as Array<{ label: string; text: string }>, [tConstants])
|
|
|
|
const formSchema = z.object({
|
|
text: z.string().min(1, tErrors('validation.required', { field: tErrors('fieldNames.text') })).max(1000, tErrors('validation.maxLength', { field: tErrors('fieldNames.text'), max: 1000 })),
|
|
language: z.string().optional(),
|
|
ref_audio: z.instanceof(File, { message: tErrors('validation.required', { field: tErrors('fieldNames.reference_audio') }) }),
|
|
ref_text: z.string().optional(),
|
|
use_cache: z.boolean().optional(),
|
|
x_vector_only_mode: z.boolean().optional(),
|
|
max_new_tokens: z.number().min(128).max(4096).optional(),
|
|
temperature: z.number().min(0.1).max(2).optional(),
|
|
top_k: z.number().min(1).max(100).optional(),
|
|
top_p: z.number().min(0).max(1).optional(),
|
|
repetition_penalty: z.number().min(1).max(2).optional(),
|
|
})
|
|
const [languages, setLanguages] = useState<Language[]>([])
|
|
const [isLoading, setIsLoading] = useState(false)
|
|
const [advancedOpen, setAdvancedOpen] = useState(false)
|
|
const [step, setStep] = useState<1 | 2>(1)
|
|
const [inputTab, setInputTab] = useState<'upload' | 'record'>('upload')
|
|
const [tempAdvancedParams, setTempAdvancedParams] = useState({
|
|
max_new_tokens: 2048
|
|
})
|
|
|
|
const { currentJob, isPolling, isCompleted, startPolling, elapsedTime } = useJobPolling()
|
|
const { refresh } = useHistoryContext()
|
|
|
|
const {
|
|
register,
|
|
handleSubmit,
|
|
setValue,
|
|
watch,
|
|
control,
|
|
trigger,
|
|
formState: { errors },
|
|
} = useForm<FormData>({
|
|
resolver: zodResolver(formSchema),
|
|
defaultValues: {
|
|
text: '',
|
|
language: 'Auto',
|
|
ref_text: '',
|
|
use_cache: true,
|
|
x_vector_only_mode: false,
|
|
max_new_tokens: 2048,
|
|
temperature: 0.9,
|
|
top_k: 50,
|
|
top_p: 1.0,
|
|
repetition_penalty: 1.05,
|
|
} as Partial<FormData>,
|
|
})
|
|
|
|
useEffect(() => {
|
|
const fetchData = async () => {
|
|
try {
|
|
const langs = await ttsApi.getLanguages()
|
|
setLanguages(langs)
|
|
} catch (error) {
|
|
toast.error(t('loadDataFailed'))
|
|
}
|
|
}
|
|
fetchData()
|
|
}, [t])
|
|
|
|
useEffect(() => {
|
|
if (inputTab === 'record' && PRESET_REF_TEXTS.length > 0) {
|
|
setValue('ref_text', PRESET_REF_TEXTS[0].text)
|
|
} else if (inputTab === 'upload') {
|
|
setValue('ref_text', '')
|
|
}
|
|
}, [inputTab, setValue])
|
|
|
|
const handleNextStep = async () => {
|
|
// Validate step 1 fields
|
|
const valid = await trigger(['ref_audio', 'ref_text'])
|
|
if (valid) {
|
|
setStep(2)
|
|
}
|
|
}
|
|
|
|
const onSubmit = async (data: FormData) => {
|
|
setIsLoading(true)
|
|
try {
|
|
const result = await ttsApi.createVoiceCloneJob({
|
|
...data,
|
|
ref_audio: data.ref_audio,
|
|
})
|
|
toast.success(t('taskCreated'))
|
|
startPolling(result.job_id)
|
|
try {
|
|
await refresh()
|
|
} catch { }
|
|
} catch (error) {
|
|
toast.error(t('taskCreateFailed'))
|
|
} finally {
|
|
setIsLoading(false)
|
|
}
|
|
}
|
|
|
|
const memoizedAudioUrl = useMemo(() => {
|
|
if (!currentJob) return ''
|
|
return jobApi.getAudioUrl(currentJob.id, currentJob.audio_url)
|
|
}, [currentJob?.id, currentJob?.audio_url])
|
|
|
|
return (
|
|
<form onSubmit={handleSubmit(onSubmit)} className="space-y-4">
|
|
{/* Steps Indicator */}
|
|
<div className="flex items-center justify-center space-x-4 mb-6">
|
|
<div className={`flex items-center space-x-2 ${step === 1 ? 'text-primary' : 'text-muted-foreground'}`}>
|
|
<div className={`w-8 h-8 rounded-full flex items-center justify-center border-2 ${step === 1 ? 'border-primary bg-primary/10' : 'border-muted'}`}>1</div>
|
|
<span className="text-sm font-medium">{tVoice('step1Title')}</span>
|
|
</div>
|
|
<div className="w-8 h-[2px] bg-muted" />
|
|
<div className={`flex items-center space-x-2 ${step === 2 ? 'text-primary' : 'text-muted-foreground'}`}>
|
|
<div className={`w-8 h-8 rounded-full flex items-center justify-center border-2 ${step === 2 ? 'border-primary bg-primary/10' : 'border-muted'}`}>2</div>
|
|
<span className="text-sm font-medium">{tVoice('step2Title')}</span>
|
|
</div>
|
|
</div>
|
|
|
|
<div className={step === 1 ? 'block' : 'hidden'}>
|
|
{/* Step 1: Input Selection */}
|
|
<Tabs value={inputTab} onValueChange={(v) => setInputTab(v as any)} className="w-full">
|
|
<TabsList className="grid w-full grid-cols-2">
|
|
<TabsTrigger value="upload" className="flex items-center gap-2">
|
|
<FileText className="h-4 w-4" />
|
|
{tVoice('uploadTab')}
|
|
</TabsTrigger>
|
|
<TabsTrigger value="record" className="flex items-center gap-2">
|
|
<Mic className="h-4 w-4" />
|
|
{tVoice('recordTab')}
|
|
</TabsTrigger>
|
|
</TabsList>
|
|
|
|
<TabsContent value="upload" className="space-y-4 mt-4">
|
|
<div className="space-y-0.5">
|
|
<Label>{tVoice('refAudioLabel')}</Label>
|
|
<Controller
|
|
name="ref_audio"
|
|
control={control}
|
|
render={({ field }) => (
|
|
<FileUploader
|
|
value={field.value}
|
|
onChange={field.onChange}
|
|
error={errors.ref_audio?.message}
|
|
/>
|
|
)}
|
|
/>
|
|
</div>
|
|
<div className="space-y-0.5">
|
|
<Label>{tVoice('refTextLabel')}</Label>
|
|
<Textarea
|
|
{...register('ref_text')}
|
|
placeholder={tVoice('refTextPlaceholder')}
|
|
className="min-h-[100px]"
|
|
/>
|
|
<PresetSelector
|
|
presets={PRESET_REF_TEXTS}
|
|
onSelect={(preset) => setValue('ref_text', preset.text)}
|
|
/>
|
|
</div>
|
|
|
|
<Button type="button" className="w-full mt-6" onClick={handleNextStep}>
|
|
{tVoice('nextStep')}
|
|
<ArrowRight className="ml-2 h-4 w-4" />
|
|
</Button>
|
|
</TabsContent>
|
|
|
|
<TabsContent value="record" className="space-y-4 mt-4">
|
|
<div className="space-y-2">
|
|
<Label className="text-base font-medium">{tVoice('readPrompt')}</Label>
|
|
<div className="grid grid-cols-3 gap-2">
|
|
{PRESET_REF_TEXTS.map((preset, i) => {
|
|
const isSelected = watch('ref_text') === preset.text
|
|
return (
|
|
<div
|
|
key={i}
|
|
className={`p-3 border rounded-lg hover:bg-accent cursor-pointer transition-colors text-sm text-center ${
|
|
isSelected ? 'border-primary bg-primary/10' : ''
|
|
}`}
|
|
onClick={() => setValue('ref_text', preset.text)}
|
|
>
|
|
<div className="font-medium">{preset.label}</div>
|
|
</div>
|
|
)
|
|
})}
|
|
</div>
|
|
<div className="space-y-0.5 pt-2">
|
|
<Label>{tVoice('currentRefText')}</Label>
|
|
<Textarea
|
|
{...register('ref_text')}
|
|
placeholder={tVoice('currentRefTextPlaceholder')}
|
|
className="min-h-[80px]"
|
|
/>
|
|
</div>
|
|
</div>
|
|
|
|
{/* Mobile-friendly Bottom Recorder Area */}
|
|
<div className="fixed bottom-0 left-0 right-0 p-4 bg-background border-t z-50 md:relative md:border-t-0 md:bg-transparent md:p-0 md:z-0">
|
|
<div className="space-y-3">
|
|
{watch('ref_audio') && (
|
|
<Button type="button" className="w-full" onClick={handleNextStep}>
|
|
{tVoice('nextStep')}
|
|
<ArrowRight className="ml-2 h-4 w-4" />
|
|
</Button>
|
|
)}
|
|
<Controller
|
|
name="ref_audio"
|
|
control={control}
|
|
render={({ field }) => (
|
|
<AudioRecorder
|
|
onChange={field.onChange}
|
|
/>
|
|
)}
|
|
/>
|
|
{errors.ref_audio && (
|
|
<p className="text-sm text-destructive mt-2 text-center md:text-left">{errors.ref_audio.message}</p>
|
|
)}
|
|
</div>
|
|
</div>
|
|
{/* Spacer for mobile to prevent content being hidden behind fixed footer */}
|
|
<div className="h-24 md:hidden" />
|
|
</TabsContent>
|
|
</Tabs>
|
|
</div>
|
|
|
|
<div className={step === 2 ? 'block space-y-4' : 'hidden'}>
|
|
{/* Step 2: Synthesis Options */}
|
|
<div className="space-y-0.5">
|
|
<IconLabel icon={Globe2} tooltip={tVoice('languageOptional')} />
|
|
<Select
|
|
value={watch('language')}
|
|
onValueChange={(value: string) => setValue('language', value)}
|
|
>
|
|
<SelectTrigger>
|
|
<SelectValue />
|
|
</SelectTrigger>
|
|
<SelectContent>
|
|
{languages.map((lang) => (
|
|
<SelectItem key={lang.code} value={lang.code}>
|
|
{tConstants(`languages.${lang.code}`, { defaultValue: lang.name })}
|
|
</SelectItem>
|
|
))}
|
|
</SelectContent>
|
|
</Select>
|
|
</div>
|
|
|
|
<div className="space-y-0.5">
|
|
<IconLabel icon={Type} tooltip={t('textLabel')} required />
|
|
<Textarea
|
|
{...register('text')}
|
|
placeholder={t('textPlaceholder')}
|
|
className="min-h-[120px]"
|
|
/>
|
|
<PresetSelector
|
|
presets={PRESET_REF_TEXTS}
|
|
onSelect={(preset) => setValue('text', preset.text)}
|
|
/>
|
|
{errors.text && (
|
|
<p className="text-sm text-destructive">{errors.text.message}</p>
|
|
)}
|
|
</div>
|
|
|
|
<div className="flex flex-col sm:flex-row gap-4 pt-2">
|
|
<div className="flex items-center space-x-2">
|
|
<Checkbox
|
|
id="x_vector_only_mode"
|
|
checked={watch('x_vector_only_mode')}
|
|
onCheckedChange={(c) => setValue('x_vector_only_mode', c as boolean)}
|
|
/>
|
|
<Label htmlFor="x_vector_only_mode" className="text-sm font-normal cursor-pointer">
|
|
{tVoice('fastMode')}
|
|
</Label>
|
|
</div>
|
|
|
|
<div className="flex items-center space-x-2">
|
|
<Checkbox
|
|
id="use_cache"
|
|
checked={watch('use_cache')}
|
|
onCheckedChange={(c) => setValue('use_cache', c as boolean)}
|
|
/>
|
|
<Label htmlFor="use_cache" className="text-sm font-normal cursor-pointer">
|
|
{tVoice('useCache')}
|
|
</Label>
|
|
</div>
|
|
</div>
|
|
|
|
<Dialog open={advancedOpen} onOpenChange={(open) => {
|
|
if (open) {
|
|
setTempAdvancedParams({
|
|
max_new_tokens: watch('max_new_tokens') || 2048
|
|
})
|
|
}
|
|
setAdvancedOpen(open)
|
|
}}>
|
|
<DialogTrigger asChild>
|
|
<Button type="button" variant="outline" className="w-full">
|
|
<Settings className="mr-2 h-4 w-4" />
|
|
{t('advancedOptions')}
|
|
</Button>
|
|
</DialogTrigger>
|
|
<DialogContent className="sm:max-w-[500px]">
|
|
<DialogHeader>
|
|
<DialogTitle>{t('advancedOptionsTitle')}</DialogTitle>
|
|
<DialogDescription>{t('advancedOptionsDescription')}</DialogDescription>
|
|
</DialogHeader>
|
|
<div className="space-y-4 py-4">
|
|
<div className="space-y-2">
|
|
<Label htmlFor="dialog-max_new_tokens">
|
|
{t('advancedParams.maxNewTokens.label')}
|
|
</Label>
|
|
<Input
|
|
id="dialog-max_new_tokens"
|
|
type="number"
|
|
min={128}
|
|
max={4096}
|
|
value={tempAdvancedParams.max_new_tokens}
|
|
onChange={(e) => setTempAdvancedParams({
|
|
...tempAdvancedParams,
|
|
max_new_tokens: parseInt(e.target.value) || 2048
|
|
})}
|
|
/>
|
|
<p className="text-sm text-muted-foreground">
|
|
{t('advancedParams.maxNewTokens.description')}
|
|
</p>
|
|
</div>
|
|
</div>
|
|
<DialogFooter>
|
|
<Button
|
|
type="button"
|
|
variant="outline"
|
|
onClick={() => {
|
|
setAdvancedOpen(false)
|
|
}}
|
|
>
|
|
{tCommon('cancel')}
|
|
</Button>
|
|
<Button
|
|
type="button"
|
|
onClick={() => {
|
|
setValue('max_new_tokens', tempAdvancedParams.max_new_tokens)
|
|
setAdvancedOpen(false)
|
|
}}
|
|
>
|
|
{tCommon('ok')}
|
|
</Button>
|
|
</DialogFooter>
|
|
</DialogContent>
|
|
</Dialog>
|
|
|
|
<div className="flex gap-3 pt-4">
|
|
<Button type="button" variant="outline" onClick={() => setStep(1)} className="w-1/3">
|
|
<ArrowLeft className="mr-2 h-4 w-4" />
|
|
{tVoice('prevStep')}
|
|
</Button>
|
|
<TooltipProvider>
|
|
<Tooltip>
|
|
<TooltipTrigger asChild>
|
|
<Button type="submit" className="flex-1" disabled={isLoading || isPolling}>
|
|
<Play className="mr-2 h-4 w-4" />
|
|
{isLoading ? t('creating') : t('generate')}
|
|
</Button>
|
|
</TooltipTrigger>
|
|
<TooltipContent>
|
|
<p>{t('generate')}</p>
|
|
</TooltipContent>
|
|
</Tooltip>
|
|
</TooltipProvider>
|
|
</div>
|
|
</div>
|
|
|
|
{isPolling && <LoadingState elapsedTime={elapsedTime} />}
|
|
|
|
{isCompleted && currentJob && (
|
|
<div className="space-y-4 pt-4 border-t">
|
|
<AudioPlayer
|
|
audioUrl={memoizedAudioUrl}
|
|
jobId={currentJob.id}
|
|
text={currentJob.parameters?.text}
|
|
/>
|
|
</div>
|
|
)}
|
|
</form>
|
|
)
|
|
}
|
|
|
|
export default VoiceCloneForm
|