feat: Implement a multi-step voice cloning form with dedicated file upload and audio recording inputs.

This commit is contained in:
2026-02-03 14:29:31 +08:00
parent 117a51ca77
commit 5a5c93f075

View File

@@ -9,7 +9,7 @@ import { Select, SelectContent, SelectItem, SelectTrigger, SelectValue } from '@
import { Dialog, DialogContent, DialogHeader, DialogTitle, DialogTrigger, DialogFooter } from '@/components/ui/dialog' import { Dialog, DialogContent, DialogHeader, DialogTitle, DialogTrigger, DialogFooter } from '@/components/ui/dialog'
import { Checkbox } from '@/components/ui/checkbox' import { Checkbox } from '@/components/ui/checkbox'
import { Label } from '@/components/ui/label' import { Label } from '@/components/ui/label'
import { Settings, Globe2, Type, Play, FileText, Mic, Zap, Database } from 'lucide-react' import { Settings, Globe2, Type, Play, FileText, Mic, Zap, Database, ArrowRight, ArrowLeft } from 'lucide-react'
import { toast } from 'sonner' import { toast } from 'sonner'
import { IconLabel } from '@/components/IconLabel' import { IconLabel } from '@/components/IconLabel'
import { Tooltip, TooltipContent, TooltipProvider, TooltipTrigger } from '@/components/ui/tooltip' import { Tooltip, TooltipContent, TooltipProvider, TooltipTrigger } from '@/components/ui/tooltip'
@@ -18,11 +18,12 @@ import { useJobPolling } from '@/hooks/useJobPolling'
import { useHistoryContext } from '@/contexts/HistoryContext' import { useHistoryContext } from '@/contexts/HistoryContext'
import { LoadingState } from '@/components/LoadingState' import { LoadingState } from '@/components/LoadingState'
import { AudioPlayer } from '@/components/AudioPlayer' import { AudioPlayer } from '@/components/AudioPlayer'
import { AudioInputSelector } from '@/components/AudioInputSelector' import { FileUploader } from '@/components/FileUploader'
import { AudioRecorder } from '@/components/AudioRecorder'
import { PresetSelector } from '@/components/PresetSelector' import { PresetSelector } from '@/components/PresetSelector'
import { ParamInput } from '@/components/ParamInput'
import { PRESET_REF_TEXTS, ADVANCED_PARAMS_INFO } from '@/lib/constants' import { PRESET_REF_TEXTS, ADVANCED_PARAMS_INFO } from '@/lib/constants'
import type { Language } from '@/types/tts' import type { Language } from '@/types/tts'
import { Tabs, TabsContent, TabsList, TabsTrigger } from '@/components/ui/tabs'
const formSchema = z.object({ const formSchema = z.object({
text: z.string().min(1, '请输入要合成的文本').max(5000, '文本长度不能超过 5000 字符'), text: z.string().min(1, '请输入要合成的文本').max(5000, '文本长度不能超过 5000 字符'),
@@ -44,6 +45,8 @@ function VoiceCloneForm() {
const [languages, setLanguages] = useState<Language[]>([]) const [languages, setLanguages] = useState<Language[]>([])
const [isLoading, setIsLoading] = useState(false) const [isLoading, setIsLoading] = useState(false)
const [advancedOpen, setAdvancedOpen] = useState(false) const [advancedOpen, setAdvancedOpen] = useState(false)
const [step, setStep] = useState<1 | 2>(1)
const [inputTab, setInputTab] = useState<'upload' | 'record'>('upload')
const [tempAdvancedParams, setTempAdvancedParams] = useState({ const [tempAdvancedParams, setTempAdvancedParams] = useState({
max_new_tokens: 2048 max_new_tokens: 2048
}) })
@@ -57,6 +60,7 @@ function VoiceCloneForm() {
setValue, setValue,
watch, watch,
control, control,
trigger,
formState: { errors }, formState: { errors },
} = useForm<FormData>({ } = useForm<FormData>({
resolver: zodResolver(formSchema), resolver: zodResolver(formSchema),
@@ -86,6 +90,14 @@ function VoiceCloneForm() {
fetchData() fetchData()
}, []) }, [])
/**
 * Handler for the step-1 "next" button.
 *
 * Asks the form to validate only the step-1 fields (`ref_audio` and
 * `ref_text`) and switches the wizard to step 2 when that validation passes.
 * When it does not pass, the step is left unchanged.
 */
const handleNextStep = async () => {
  const stepOneValid = await trigger(['ref_audio', 'ref_text'])
  // Guard clause: stay on the current step while step-1 fields are invalid.
  if (!stepOneValid) {
    return
  }
  setStep(2)
}
const onSubmit = async (data: FormData) => { const onSubmit = async (data: FormData) => {
setIsLoading(true) setIsLoading(true)
try { try {
@@ -111,30 +123,42 @@ function VoiceCloneForm() {
}, [currentJob?.id, currentJob?.audio_url]) }, [currentJob?.id, currentJob?.audio_url])
return ( return (
<form onSubmit={handleSubmit(onSubmit)} className="space-y-2"> <form onSubmit={handleSubmit(onSubmit)} className="space-y-4">
<div className="space-y-0.5"> {/* Steps Indicator */}
<IconLabel icon={FileText} tooltip="参考文稿(可选)" /> <div className="flex items-center justify-center space-x-4 mb-6">
<Textarea <div className={`flex items-center space-x-2 ${step === 1 ? 'text-primary' : 'text-muted-foreground'}`}>
{...register('ref_text')} <div className={`w-8 h-8 rounded-full flex items-center justify-center border-2 ${step === 1 ? 'border-primary bg-primary/10' : 'border-muted'}`}>1</div>
placeholder="参考音频对应的文本..." <span className="text-sm font-medium"></span>
className="min-h-[40px] md:min-h-[60px]" </div>
/> <div className="w-8 h-[2px] bg-muted" />
<PresetSelector <div className={`flex items-center space-x-2 ${step === 2 ? 'text-primary' : 'text-muted-foreground'}`}>
presets={PRESET_REF_TEXTS} <div className={`w-8 h-8 rounded-full flex items-center justify-center border-2 ${step === 2 ? 'border-primary bg-primary/10' : 'border-muted'}`}>2</div>
onSelect={(preset) => setValue('ref_text', preset.text)} <span className="text-sm font-medium"></span>
/> </div>
{errors.ref_text && (
<p className="text-sm text-destructive">{errors.ref_text.message}</p>
)}
</div> </div>
<div className={step === 1 ? 'block' : 'hidden'}>
{/* Step 1: Input Selection */}
<Tabs value={inputTab} onValueChange={(v) => setInputTab(v as any)} className="w-full">
<TabsList className="grid w-full grid-cols-2">
<TabsTrigger value="upload" className="flex items-center gap-2">
<FileText className="h-4 w-4" />
</TabsTrigger>
<TabsTrigger value="record" className="flex items-center gap-2">
<Mic className="h-4 w-4" />
线
</TabsTrigger>
</TabsList>
<TabsContent value="upload" className="space-y-4 mt-4">
<div className="space-y-0.5"> <div className="space-y-0.5">
<IconLabel icon={Mic} tooltip="参考音频" required /> <Label></Label>
<Controller <Controller
name="ref_audio" name="ref_audio"
control={control} control={control}
render={({ field }) => ( render={({ field }) => (
<AudioInputSelector <FileUploader
value={field.value} value={field.value}
onChange={field.onChange} onChange={field.onChange}
error={errors.ref_audio?.message} error={errors.ref_audio?.message}
@@ -142,7 +166,73 @@ function VoiceCloneForm() {
)} )}
/> />
</div> </div>
<div className="space-y-0.5">
<Label>稿</Label>
<Textarea
{...register('ref_text')}
placeholder="参考音频对应的文本内容..."
className="min-h-[100px]"
/>
<PresetSelector
presets={PRESET_REF_TEXTS}
onSelect={(preset) => setValue('ref_text', preset.text)}
/>
</div>
</TabsContent>
<TabsContent value="record" className="space-y-4 mt-4">
<div className="space-y-2">
<Label className="text-base font-medium"></Label>
<div className="grid gap-2">
{PRESET_REF_TEXTS.map((preset, i) => (
<div
key={i}
className="p-3 border rounded-lg hover:bg-accent cursor-pointer transition-colors text-sm"
onClick={() => setValue('ref_text', preset.text)}
>
<div className="font-medium mb-1">{preset.label}</div>
<div className="text-muted-foreground line-clamp-2">{preset.text}</div>
</div>
))}
</div>
<div className="space-y-0.5 pt-2">
<Label></Label>
<Textarea
{...register('ref_text')}
placeholder="选中的文本将显示在这里..."
className="min-h-[80px]"
/>
</div>
</div>
{/* Mobile-friendly Bottom Recorder Area */}
<div className="fixed bottom-0 left-0 right-0 p-4 bg-background border-t z-50 md:relative md:border-t-0 md:bg-transparent md:p-0 md:z-0">
<Controller
name="ref_audio"
control={control}
render={({ field }) => (
<AudioRecorder
onChange={field.onChange}
/>
)}
/>
{errors.ref_audio && (
<p className="text-sm text-destructive mt-2 text-center md:text-left">{errors.ref_audio.message}</p>
)}
</div>
{/* Spacer for mobile to prevent content being hidden behind fixed footer */}
<div className="h-24 md:hidden" />
</TabsContent>
</Tabs>
<Button type="button" className="w-full mt-6" onClick={handleNextStep}>
<ArrowRight className="ml-2 h-4 w-4" />
</Button>
</div>
<div className={step === 2 ? 'block space-y-4' : 'hidden'}>
{/* Step 2: Synthesis Options */}
<div className="space-y-0.5"> <div className="space-y-0.5">
<IconLabel icon={Globe2} tooltip="语言(可选)" /> <IconLabel icon={Globe2} tooltip="语言(可选)" />
<Select <Select
@@ -167,7 +257,7 @@ function VoiceCloneForm() {
<Textarea <Textarea
{...register('text')} {...register('text')}
placeholder="输入要合成的文本..." placeholder="输入要合成的文本..."
className="min-h-[40px] md:min-h-[60px]" className="min-h-[120px]"
/> />
<PresetSelector <PresetSelector
presets={PRESET_REF_TEXTS} presets={PRESET_REF_TEXTS}
@@ -178,39 +268,25 @@ function VoiceCloneForm() {
)} )}
</div> </div>
<div className="flex items-center space-x-3"> <div className="flex flex-col sm:flex-row gap-4 pt-2">
<div className="flex items-center space-x-2"> <div className="flex items-center space-x-2">
<Zap className="h-4 w-4 text-muted-foreground" />
<Controller
name="x_vector_only_mode"
control={control}
render={({ field }) => (
<Checkbox <Checkbox
id="x_vector_only_mode" id="x_vector_only_mode"
checked={field.value} checked={watch('x_vector_only_mode')}
onCheckedChange={field.onChange} onCheckedChange={(c) => setValue('x_vector_only_mode', c as boolean)}
/> />
)} <Label htmlFor="x_vector_only_mode" className="text-sm font-normal cursor-pointer">
/>
<Label htmlFor="x_vector_only_mode" className="text-sm font-normal">
</Label> </Label>
</div> </div>
<div className="flex items-center space-x-2"> <div className="flex items-center space-x-2">
<Database className="h-4 w-4 text-muted-foreground" />
<Controller
name="use_cache"
control={control}
render={({ field }) => (
<Checkbox <Checkbox
id="use_cache" id="use_cache"
checked={field.value} checked={watch('use_cache')}
onCheckedChange={field.onChange} onCheckedChange={(c) => setValue('use_cache', c as boolean)}
/> />
)} <Label htmlFor="use_cache" className="text-sm font-normal cursor-pointer">
/>
<Label htmlFor="use_cache" className="text-sm font-normal">
使 使
</Label> </Label>
</div> </div>
@@ -219,7 +295,7 @@ function VoiceCloneForm() {
<Dialog open={advancedOpen} onOpenChange={(open) => { <Dialog open={advancedOpen} onOpenChange={(open) => {
if (open) { if (open) {
setTempAdvancedParams({ setTempAdvancedParams({
max_new_tokens: watch('max_new_tokens') max_new_tokens: watch('max_new_tokens') || 2048
}) })
} }
setAdvancedOpen(open) setAdvancedOpen(open)
@@ -260,7 +336,6 @@ function VoiceCloneForm() {
type="button" type="button"
variant="outline" variant="outline"
onClick={() => { onClick={() => {
setTempAdvancedParams({ max_new_tokens: watch('max_new_tokens') })
setAdvancedOpen(false) setAdvancedOpen(false)
}} }}
> >
@@ -279,10 +354,15 @@ function VoiceCloneForm() {
</DialogContent> </DialogContent>
</Dialog> </Dialog>
<div className="flex gap-3 pt-4">
<Button type="button" variant="outline" onClick={() => setStep(1)} className="w-1/3">
<ArrowLeft className="mr-2 h-4 w-4" />
</Button>
<TooltipProvider> <TooltipProvider>
<Tooltip> <Tooltip>
<TooltipTrigger asChild> <TooltipTrigger asChild>
<Button type="submit" className="w-full" disabled={isLoading || isPolling}> <Button type="submit" className="flex-1" disabled={isLoading || isPolling}>
<Play className="mr-2 h-4 w-4" /> <Play className="mr-2 h-4 w-4" />
{isLoading ? '创建中...' : '生成语音'} {isLoading ? '创建中...' : '生成语音'}
</Button> </Button>
@@ -292,6 +372,8 @@ function VoiceCloneForm() {
</TooltipContent> </TooltipContent>
</Tooltip> </Tooltip>
</TooltipProvider> </TooltipProvider>
</div>
</div>
{isPolling && <LoadingState elapsedTime={elapsedTime} />} {isPolling && <LoadingState elapsedTime={elapsedTime} />}