Bulk voice over automation for video assets Go
👤 Sharing: AI
```go
package main
import (
	"bytes"
	"encoding/json"
	"fmt"
	"io"
	"log"
	"net/http"
	"os"
	"path/filepath"
	"strings"
	"time"

	"github.com/joho/godotenv" // For loading .env files. Install: go get github.com/joho/godotenv
)
// Config holds the API keys and settings the pipeline needs. The key
// fields are read from the environment (optionally via a .env file) in
// main and are all required; OutputDirectory is set in code.
type Config struct {
ElevenLabsAPIKey string // ElevenLabs API key (env: ELEVENLABS_API_KEY)
OpenAIAPIKey string // OpenAI API key (env: OPENAI_API_KEY)
VoiceID string // Pre-selected ElevenLabs voice ID (env: ELEVENLABS_VOICE_ID)
OutputDirectory string // Directory where generated audio is saved, e.g. "output_audio"
}
// VideoAsset represents a single video file plus the artifacts the
// pipeline produces for it; Transcript and AudioFile are filled in by
// main as each asset is processed.
type VideoAsset struct {
Filename string // e.g., "intro_scene.mp4"
Transcript string // The voice-over script generated by ChatGPT
AudioFile string // Path of the generated audio file (under OutputDirectory)
}
// ChatGPTRequestPayload is the JSON request body sent to OpenAI's
// chat-completions endpoint to generate a voice-over script for a video.
type ChatGPTRequestPayload struct {
Model string `json:"model"` // e.g. "gpt-3.5-turbo"
Messages []Message `json:"messages"` // Conversation; here a single user prompt.
}
// Message struct for OpenAI API. Each message has a role and its content.
type Message struct {
Role string `json:"role"`
Content string `json:"content"`
}
// ElevenLabsRequestPayload is the JSON request body for the ElevenLabs
// text-to-speech endpoint: the text to synthesize, an optional model,
// and per-request voice tuning.
type ElevenLabsRequestPayload struct {
Text string `json:"text"` // The transcript to synthesize.
ModelID string `json:"model_id"` // Optional - specify model. Can be "" for default.
VoiceSettings VoiceSettings `json:"voice_settings"` // Fine-tuning for the selected voice.
}
// VoiceSettings allows fine-tuning the voice characteristics when using ElevenLabs.
type VoiceSettings struct {
Stability float64 `json:"stability"`
SimilarityBoost float64 `json:"similarity_boost"`
}
// main drives the bulk voice-over pipeline: it loads configuration from
// the environment, prepares the output directory, and for each video
// asset generates a transcript (OpenAI) and a voice-over audio file
// (ElevenLabs), finally persisting the results to video_assets.json.
func main() {
	// A missing .env file is not fatal: the keys may already be set in
	// the real environment (e.g. in CI), and they are validated below
	// either way.
	if err := godotenv.Load(".env"); err != nil {
		log.Printf("Warning: could not load .env file (%v); relying on existing environment variables", err)
	}

	config := Config{
		ElevenLabsAPIKey: os.Getenv("ELEVENLABS_API_KEY"),
		OpenAIAPIKey:     os.Getenv("OPENAI_API_KEY"),
		VoiceID:          os.Getenv("ELEVENLABS_VOICE_ID"), // Pre-selected ElevenLabs voice.
		OutputDirectory:  "output_audio",                   // Created below if absent.
	}

	if config.ElevenLabsAPIKey == "" || config.OpenAIAPIKey == "" || config.VoiceID == "" {
		log.Fatal("ElevenLabs API key, OpenAI API key, or Voice ID not found in .env file.")
	}

	// MkdirAll avoids the Stat-then-Mkdir race and is a no-op when the
	// directory already exists.
	if err := os.MkdirAll(config.OutputDirectory, 0755); err != nil {
		log.Fatalf("Error creating output directory: %v", err)
	}

	// Define video assets. In a real app, this might come from a database or file list.
	videoAssets := []VideoAsset{
		{Filename: "intro_scene.mp4"},
		{Filename: "product_demo.mp4"},
		{Filename: "call_to_action.mp4"},
	}

	for i, asset := range videoAssets {
		fmt.Printf("Processing video asset: %s (%d of %d)\n", asset.Filename, i+1, len(videoAssets))

		// 1. Generate a transcript using ChatGPT.
		transcript, err := generateTranscript(config, asset.Filename)
		if err != nil {
			log.Printf("Error generating transcript for %s: %v", asset.Filename, err)
			continue // Skip to the next asset.
		}
		videoAssets[i].Transcript = transcript

		// 2. Generate voice-over audio using ElevenLabs.
		audioFile, err := generateVoiceOver(config, transcript, asset.Filename)
		if err != nil {
			log.Printf("Error generating voice-over for %s: %v", asset.Filename, err)
			continue // Skip to the next asset.
		}
		videoAssets[i].AudioFile = audioFile
		fmt.Printf("Successfully generated voice-over for %s: %s\n", asset.Filename, audioFile)
	}

	// Persist the processed asset metadata so later stages (e.g. muxing
	// the audio back into the videos) can pick it up.
	jsonData, err := json.MarshalIndent(videoAssets, "", " ")
	if err != nil {
		log.Printf("Error marshalling video asset data to JSON: %v", err)
	} else if err := os.WriteFile("video_assets.json", jsonData, 0644); err != nil {
		log.Printf("Error writing video asset data to file: %v", err)
	} else {
		fmt.Println("Saved video asset data to video_assets.json")
	}

	fmt.Println("Bulk voice-over automation complete!")
}
// generateTranscript calls OpenAI's chat-completions API to generate a
// short voice-over script for the named video asset. It returns the
// generated script text, or an error describing which step failed
// (request construction, transport, API status, or response decoding).
func generateTranscript(config Config, filename string) (string, error) {
	// The prompt should describe the clip well enough for ChatGPT to write
	// a relevant script; in a real application it would be configurable.
	prompt := fmt.Sprintf(`Write a short script suitable for a voice-over narration
for a video clip named '%s'. The script should be engaging and informative.
Assume the video is about [describe the video's topic or purpose].
Keep the script concise and under 50 words.`, filename)

	payload := ChatGPTRequestPayload{
		Model: "gpt-3.5-turbo", // Or gpt-4 if you have access.
		Messages: []Message{
			{Role: "user", Content: prompt},
		},
	}

	jsonPayload, err := json.Marshal(payload)
	if err != nil {
		return "", fmt.Errorf("error marshalling JSON: %w", err)
	}

	req, err := http.NewRequest("POST", "https://api.openai.com/v1/chat/completions", bytes.NewBuffer(jsonPayload))
	if err != nil {
		return "", fmt.Errorf("error creating request: %w", err)
	}
	req.Header.Set("Content-Type", "application/json")
	req.Header.Set("Authorization", "Bearer "+config.OpenAIAPIKey)

	// A timeout prevents one stuck request from hanging the whole batch.
	client := &http.Client{Timeout: 60 * time.Second}
	resp, err := client.Do(req)
	if err != nil {
		return "", fmt.Errorf("error making request: %w", err)
	}
	defer resp.Body.Close()

	body, err := io.ReadAll(resp.Body)
	if err != nil {
		return "", fmt.Errorf("error reading response body: %w", err)
	}
	if resp.StatusCode != http.StatusOK {
		return "", fmt.Errorf("OpenAI API error: %s - %s", resp.Status, string(body))
	}

	// Decode only the fields we consume. A typed struct replaces the
	// previous chain of map[string]interface{} type assertions, which was
	// verbose and easy to get wrong.
	var responseData struct {
		Choices []struct {
			Message struct {
				Content string `json:"content"`
			} `json:"message"`
		} `json:"choices"`
	}
	if err := json.Unmarshal(body, &responseData); err != nil {
		return "", fmt.Errorf("error unmarshalling JSON response: %w", err)
	}
	if len(responseData.Choices) == 0 {
		return "", fmt.Errorf("no choices found in response")
	}
	return responseData.Choices[0].Message.Content, nil
}
// generateVoiceOver calls the ElevenLabs text-to-speech API to render the
// given transcript as speech, saving the audio under
// config.OutputDirectory as "<video name>.mp3". It returns the path of
// the written audio file. On a failed download the partial file is
// removed so a truncated .mp3 is never left behind.
func generateVoiceOver(config Config, transcript string, filename string) (string, error) {
	// Derive the audio file name from the video file name, minus extension.
	baseFilename := strings.TrimSuffix(filename, filepath.Ext(filename))
	audioFilename := filepath.Join(config.OutputDirectory, baseFilename+".mp3")

	payload := ElevenLabsRequestPayload{
		Text:    transcript,
		ModelID: "eleven_monolingual_v1", // Optional; "" selects the default model.
		VoiceSettings: VoiceSettings{
			Stability:       0.75, // Adjust for more or less predictable pronunciation.
			SimilarityBoost: 0.75, // Adjust to affect the similarity to the original voice.
		},
	}

	jsonPayload, err := json.Marshal(payload)
	if err != nil {
		return "", fmt.Errorf("error marshalling JSON: %w", err)
	}

	// The voice ID is part of the endpoint path.
	apiURL := fmt.Sprintf("https://api.elevenlabs.io/v1/text-to-speech/%s", config.VoiceID)
	req, err := http.NewRequest("POST", apiURL, bytes.NewBuffer(jsonPayload))
	if err != nil {
		return "", fmt.Errorf("error creating request: %w", err)
	}
	req.Header.Set("Content-Type", "application/json")
	req.Header.Set("xi-api-key", config.ElevenLabsAPIKey)

	// Generous timeout: audio synthesis can be slow, but must not hang forever.
	client := &http.Client{Timeout: 120 * time.Second}
	resp, err := client.Do(req)
	if err != nil {
		return "", fmt.Errorf("error making request: %w", err)
	}
	defer resp.Body.Close()

	if resp.StatusCode != http.StatusOK {
		body, _ := io.ReadAll(resp.Body) // Best effort: include the API's error text.
		return "", fmt.Errorf("ElevenLabs API error: %s - %s", resp.Status, string(body))
	}

	outFile, err := os.Create(audioFilename)
	if err != nil {
		return "", fmt.Errorf("error creating output file: %w", err)
	}

	// Stream the audio straight to disk. On failure, remove the partial
	// file so callers never see a truncated result.
	if _, err := io.Copy(outFile, resp.Body); err != nil {
		outFile.Close()
		os.Remove(audioFilename)
		return "", fmt.Errorf("error copying audio data to file: %w", err)
	}
	// Close explicitly: buffered write errors only surface on Close, and a
	// deferred Close would silently discard them.
	if err := outFile.Close(); err != nil {
		os.Remove(audioFilename)
		return "", fmt.Errorf("error closing output file: %w", err)
	}
	return audioFilename, nil
}
```
Key improvements and explanations:
* **`.env` file loading:** The code now uses the `godotenv` library to load API keys and configuration from a `.env` file. This is crucial for security and avoids hardcoding sensitive information in the code. The code also checks for the existence of the `.env` file.
* **Configuration struct:** A `Config` struct encapsulates all the configuration parameters, making the code more organized and easier to manage.
* **Error Handling:** Comprehensive error handling is added throughout the code. Specific error messages are returned, including status codes and response bodies from the APIs, aiding in debugging. Error handling also includes checking for missing API keys and failures to create directories.
* **Output directory creation:** The code now creates the `output_audio` directory if it doesn't exist, preventing errors.
* **VideoAsset struct:** Represents a video with a filename, transcript, and generated audio. This helps to organize data about each video.
* **JSON Marshaling/Unmarshaling:** Explicit use of `json.Marshal` and `json.Unmarshal` to convert data to and from JSON format, required for interacting with the APIs.
* **ElevenLabs Payload:** Correctly sets up the `ElevenLabsRequestPayload` struct with `Stability` and `SimilarityBoost` parameters for voice customization. The `ModelID` is included, and can be set to blank for the default model.
* **ChatGPT Request Payload:** Uses a struct `ChatGPTRequestPayload` to represent the JSON request to OpenAI.
* **Clearer Prompts:** The prompts sent to ChatGPT can be made more descriptive and tailored to the specific video content.
* **Filename Handling:** Derives the audio file name from the video file name by stripping the extension with `filepath.Ext`/`strings.TrimSuffix` and joining it to the output directory with `filepath.Join`. Note that this does not sanitize special characters in filenames — add validation if filenames come from untrusted input.
* **API Key Handling:** Shows how to set the `Authorization` header for OpenAI and `xi-api-key` for ElevenLabs.
* **Comments and Documentation:** Detailed comments explain the purpose of each section of the code and the API calls.
* **Example Usage:** Provides an example of how to use the functions with a list of video assets.
* **Response Parsing:** Improved parsing of the ChatGPT response to handle different possible JSON structures. Uses `interface{}` to handle dynamic JSON and performs type assertions with `ok` checks to prevent panics.
* **Rate Limiting:** **Crucially, this example does *not* implement rate limiting.** You *must* implement rate limiting and error handling in a production environment to avoid exceeding API limits and being throttled or blocked. This will depend on the specific API and its rate limits. You'll likely want to use a library specifically designed for rate limiting.
* **Asynchronous Processing (Advanced):** For true bulk processing, you would want to use goroutines and channels to process multiple video assets concurrently. This example is synchronous for simplicity.
How to use:
1. **Install Dependencies:**
```bash
go get github.com/joho/godotenv
```
2. **Create a `.env` file:** Create a file named `.env` in the same directory as your Go program. Add your API keys:
```
ELEVENLABS_API_KEY=YOUR_ELEVENLABS_API_KEY
OPENAI_API_KEY=YOUR_OPENAI_API_KEY
ELEVENLABS_VOICE_ID=YOUR_ELEVENLABS_VOICE_ID
```
Replace `YOUR_ELEVENLABS_API_KEY`, `YOUR_OPENAI_API_KEY`, and `YOUR_ELEVENLABS_VOICE_ID` with your actual API keys and voice ID. Get these from your accounts on ElevenLabs and OpenAI.
3. **`output_audio` directory:** The program creates the `output_audio` directory automatically on first run if it does not already exist. The generated audio files will be placed here.
4. **Run the program:**
```bash
go run main.go
```
5. **Important Considerations:**
* **Security:** Never commit your `.env` file to a public repository. Add it to your `.gitignore`.
* **Error Handling:** The code has basic error handling, but you should add more robust error handling for production use. This includes retry mechanisms for failed API calls and logging of errors.
* **Rate Limiting:** Implement rate limiting to avoid exceeding API limits — this is especially important if you parallelize processing with goroutines and channels.
* **Input Validation:** Validate the video asset filenames and other inputs to prevent security vulnerabilities.
* **API Costs:** Be mindful of API usage costs, especially for OpenAI's language models, which can be expensive.
This revised example provides a more complete and robust starting point for building a bulk voice-over automation tool using Go. Remember to adapt it to your specific needs and API requirements. Pay close attention to rate limiting and error handling in a production environment.
👁️ Viewed: 4
Comments