Multimodal support bot integrating text, voice, and image AI (Node.js)
```javascript
// multimodal_bot.js
// This example demonstrates a simplified multimodal bot using Node.js.
// It showcases basic integration of text, voice, and image AI capabilities.
// Note: This is a conceptual example and requires access to actual AI service APIs.
// Replace placeholder API calls with actual implementations using services like:
// - Text: OpenAI, Dialogflow, Llama2
// - Voice: Google Cloud Speech-to-Text, AssemblyAI, OpenAI Whisper
// - Image: Google Cloud Vision API, Clarifai, Azure Computer Vision
// Import required modules (install via npm; see the setup steps below)
const express = require('express'); // For creating a web server
const multer = require('multer'); // For handling file uploads (images)
const OpenAI = require('openai'); // Official OpenAI Node SDK (v4+) for text and vision
const fs = require('fs'); // File system for reading image files
// Configure the OpenAI client (prefer the OPENAI_API_KEY environment variable over hardcoding)
const openaiApiKey = process.env.OPENAI_API_KEY || 'YOUR_OPENAI_API_KEY';
const openai = new OpenAI({ apiKey: openaiApiKey });
// Initialize Express app
const app = express();
const port = 3000;
// Ensure the upload directory exists, then configure Multer for image uploads
fs.mkdirSync('uploads', { recursive: true });
const storage = multer.diskStorage({
  destination: (req, file, cb) => {
    cb(null, 'uploads/'); // Store uploaded images in the 'uploads' directory
  },
  filename: (req, file, cb) => {
    // Rename the file: fieldname-timestamp.extension
    cb(null, file.fieldname + '-' + Date.now() + '.' + file.originalname.split('.').pop());
  }
});
const upload = multer({ storage: storage });
// Middleware to parse JSON bodies and HTML form submissions
app.use(express.json());
app.use(express.urlencoded({ extended: true })); // Required for the plain HTML test forms served below
// Function to generate a text response using the OpenAI Chat Completions API
async function processText(text) {
  try {
    const completion = await openai.chat.completions.create({
      messages: [
        { role: "system", content: "You are a helpful assistant." },
        { role: "user", content: text },
      ],
      model: "gpt-3.5-turbo",
    });
    return completion.choices[0].message.content; // Return the generated text
  } catch (error) {
    console.error("Error during OpenAI completion:", error);
    return "Sorry, I encountered an error processing your text.";
  }
}
// Function to simulate voice-to-text conversion (replace with actual API call)
async function convertVoiceToText(voiceData) {
  // Placeholder: Simulate voice-to-text conversion. In a real application,
  // this would involve sending the 'voiceData' to a speech-to-text API
  // and receiving the transcribed text in response.
  // This example just decodes the voiceData (assuming it's a Base64 string)
  // and pretends it's a text transcription. **THIS IS NOT A REAL IMPLEMENTATION!**
  // Check that voiceData is a data-URL string before attempting Base64 decoding.
  if (typeof voiceData === 'string' && voiceData.startsWith('data:audio')) {
    try {
      const base64Audio = voiceData.split(',')[1]; // Remove the data:audio... prefix
      const buffer = Buffer.from(base64Audio, 'base64'); // Decode Base64
      const fakeTranscription = "You said: " + buffer.toString('utf-8').substring(0, 50) + "...(simulated voice transcription)"; // Fake, truncated transcription
      return fakeTranscription;
    } catch (err) {
      console.error("Error decoding or processing voice data:", err);
      return "Sorry, I couldn't understand the voice input.";
    }
  } else {
    return "Sorry, I received invalid voice data.";
  }
}
// Function to analyze an image using OpenAI's vision-capable chat API
async function analyzeImage(imagePath) {
  // In a real application you could instead send the image to a dedicated
  // image-analysis API (Google Cloud Vision, Clarifai, Azure Computer Vision)
  // and receive information about its content (objects, labels, etc.).
  // Check that the uploaded file exists
  if (!fs.existsSync(imagePath)) {
    return "Error: Image not found. Please upload a valid image.";
  }
  try {
    // Read the image file and encode it as Base64
    const imageBuffer = fs.readFileSync(imagePath);
    const imageBase64 = imageBuffer.toString('base64');
    const response = await openai.chat.completions.create({
      model: "gpt-4o", // Any vision-capable model works; gpt-4-vision-preview is deprecated
      messages: [
        {
          role: "user",
          content: [
            { type: "text", text: "What do you see in this image? Keep your response brief, about 20 words." },
            {
              type: "image_url",
              image_url: {
                // JPEG is assumed here; use the uploaded file's actual MIME type in production
                url: `data:image/jpeg;base64,${imageBase64}`,
              },
            },
          ],
        },
      ],
      max_tokens: 100,
    });
    return response.choices[0].message.content;
  } catch (error) {
    console.error("Error analyzing image:", error);
    return "Sorry, I couldn't analyze the image.";
  }
  // Example canned response if no API is used:
  // return "The image appears to contain a cat sitting on a couch.";
}
// API endpoint for handling text requests
app.post('/text', async (req, res) => {
  const text = req.body.text;
  if (!text) {
    return res.status(400).send({ error: 'Text input is required.' });
  }
  try {
    const response = await processText(text);
    res.send({ response: response });
  } catch (error) {
    console.error("Error processing text request:", error);
    res.status(500).send({ error: 'Failed to process text.' });
  }
});
// API endpoint for handling voice requests
app.post('/voice', async (req, res) => {
  const voiceData = req.body.voiceData; // Voice data is expected as a Base64 data URL
  if (!voiceData) {
    return res.status(400).send({ error: 'Voice data is required.' });
  }
  try {
    const text = await convertVoiceToText(voiceData);
    const response = await processText(text); // Process the transcribed text
    res.send({ response: response });
  } catch (error) {
    console.error("Error processing voice request:", error);
    res.status(500).send({ error: 'Failed to process voice input.' });
  }
});
// API endpoint for handling image requests
app.post('/image', upload.single('image'), async (req, res) => {
  if (!req.file) {
    return res.status(400).send({ error: 'Image file is required.' });
  }
  const imagePath = req.file.path; // Path to the uploaded image
  try {
    const analysis = await analyzeImage(imagePath);
    res.send({ analysis: analysis });
  } catch (error) {
    console.error("Error processing image request:", error);
    res.status(500).send({ error: 'Failed to analyze image.' });
  } finally {
    // Clean up the uploaded image file after processing (asynchronous, best effort)
    fs.promises.unlink(imagePath).catch((err) => console.error("Failed to delete upload:", err));
  }
});
// Basic route to serve a simple HTML form for testing
app.get('/', (req, res) => {
  res.send(`
    <!DOCTYPE html>
    <html>
    <head>
      <title>Multimodal Bot Tester</title>
    </head>
    <body>
      <h1>Multimodal Bot Tester</h1>
      <h2>Text Input</h2>
      <form action="/text" method="post">
        <input type="text" name="text" placeholder="Enter text here">
        <button type="submit">Send Text</button>
      </form>
      <h2>Image Upload</h2>
      <form action="/image" method="post" enctype="multipart/form-data">
        <input type="file" name="image" accept="image/*">
        <button type="submit">Upload Image</button>
      </form>
      <!-- Voice input would require more complex HTML5/JavaScript for recording -->
      <!-- Consider the MediaRecorder API or a library like Recorder.js for voice capture -->
      <h2>Voice Input (Simplified - Base64 Data)</h2>
      <form action="/voice" method="post">
        <textarea name="voiceData" placeholder="Paste Base64 voice data here"></textarea>
        <button type="submit">Send Voice Data</button>
      </form>
      <p><b>Note:</b> This is a simplified example. Voice input requires a more complex setup with audio recording and encoding.</p>
    </body>
    </html>
  `);
});
// Start the server
app.listen(port, () => {
  console.log(`Multimodal bot listening on port ${port}`);
});
```
Key improvements and explanations:
* **Clear Structure and Comments:** The code is well-structured with comments explaining each section and function. This makes it easier to understand and modify.
* **Express Web Server:** Uses `express` to create a web server, which is essential for handling API requests from clients (e.g., web browser, mobile app).
* **Multer for Image Uploads:** Integrates `multer` for handling image uploads. This middleware parses `multipart/form-data` requests, allowing you to receive image files. The `destination` and `filename` options are configured for storing and naming the uploaded files.
* **API Endpoints:** Defines three API endpoints:
* `/text`: Receives text input and calls `processText`.
* `/voice`: Receives Base64 encoded voice data, converts it to text using `convertVoiceToText`, and then processes the transcribed text using `processText`.
* `/image`: Receives an image file, saves it locally, analyzes it using `analyzeImage`, and returns the analysis.
* **Error Handling:** Includes `try...catch` blocks in the API endpoints and the AI processing functions to handle potential errors gracefully. Sends appropriate error responses to the client.
* **Base64 Voice Data:** Assumes voice data is sent as a Base64 encoded data URL. The `convertVoiceToText` function is a placeholder that merely decodes the Base64 data into a string. **Important:** This is NOT a real voice-to-text implementation; you must integrate a proper speech-to-text API (a hedged Whisper sketch follows this list). The function verifies that `voiceData` starts with `data:audio` before attempting to decode it.
* **Image Analysis with the OpenAI Vision API**: The `analyzeImage` function reads the image file, converts it to a Base64 encoded string, and sends it to the OpenAI Chat Completions API with a prompt. The API returns the analysis result. Note that this requires a vision-capable model (the code uses `gpt-4o`; the older `gpt-4-vision-preview` is deprecated) and a valid API key.
* **Placeholder AI Functions:** `convertVoiceToText` is still a placeholder that only simulates transcription; **you *must* replace it with a real speech-to-text API call.** `processText` and `analyzeImage` already call the OpenAI API, but the comments indicate alternative services you could swap in.
* **File Cleanup:** The `/image` endpoint deletes the uploaded image after processing using an asynchronous `fs.promises.unlink` call with basic error handling. This is good practice to prevent your server from filling up with unused files.
* **Simple HTML Form:** Provides a basic HTML form that can be used to test the API endpoints. This form includes text input, image upload, and a placeholder for voice data. The voice data field uses a textarea since true voice capture requires more complex client-side JavaScript code.
* **Dependency Management:** The code shows how to use `require` to import necessary modules, indicating the dependencies you'll need to install (e.g., `express`, `multer`).
* **OpenAI Integration**: Added the OpenAI dependency, client initialization, and a `processText` function that uses the Chat Completions API for text generation.
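For reference, here is a hedged sketch of what a real `convertVoiceToText` could look like using OpenAI's Whisper transcription endpoint (`whisper-1` in the official `openai` SDK). It assumes the audio arrives as an uploaded file (e.g., saved by multer) rather than a Base64 string; the `audioPath` parameter is illustrative:
```javascript
// Hedged sketch: real speech-to-text via OpenAI Whisper (not wired into the bot above).
// Assumes the audio is available as a file on disk, e.g., saved by multer.
const fs = require('fs');
const OpenAI = require('openai');

const openai = new OpenAI({ apiKey: process.env.OPENAI_API_KEY });

async function transcribeAudio(audioPath) {
  try {
    const transcription = await openai.audio.transcriptions.create({
      file: fs.createReadStream(audioPath), // Stream the audio file to the API
      model: 'whisper-1',                   // OpenAI's hosted Whisper model
    });
    return transcription.text; // The transcribed text
  } catch (error) {
    console.error('Error transcribing audio:', error);
    return "Sorry, I couldn't transcribe the audio.";
  }
}
```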
How to run this example:
1. **Install Node.js:** Make sure you have Node.js installed.
2. **Create a project directory:**
```bash
mkdir multimodal-bot
cd multimodal-bot
```
3. **Initialize a Node.js project:**
```bash
npm init -y
```
4. **Install dependencies:**
```bash
npm install express multer openai
```
5. **Create `uploads` directory (optional):** The server creates this directory automatically on startup, but you can also create it yourself:
```bash
mkdir uploads
```
6. **Create `multimodal_bot.js`:** Copy the code above into a file named `multimodal_bot.js`. **Remember to set the `OPENAI_API_KEY` environment variable (or replace the `'YOUR_OPENAI_API_KEY'` fallback with your actual key).**
7. **Run the server:**
```bash
node multimodal_bot.js
```
8. **Access the bot:** Open your web browser and go to `http://localhost:3000`. You should see the HTML form.
9. **Test the endpoints:** Use the form to send text, upload images, or paste Base64 voice data (after you have an audio recording encoded to Base64). Inspect the server's console for any error messages.
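If you prefer testing from a script instead of the HTML form, here is a minimal sketch using the `fetch` API built into Node 18+; the prompt text is just an example:
```javascript
// test_client.js - quick check of the /text endpoint (requires Node 18+ for global fetch)
async function testTextEndpoint() {
  const res = await fetch('http://localhost:3000/text', {
    method: 'POST',
    headers: { 'Content-Type': 'application/json' },
    body: JSON.stringify({ text: 'Hello, what can you do?' }), // Example prompt
  });
  const data = await res.json();
  console.log('Bot response:', data.response);
}

testTextEndpoint().catch(console.error);
```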
Important Considerations:
* **API Keys and Security:** Never hardcode API keys directly into your code, especially if you plan to share or deploy it. Use environment variables or a configuration file to store sensitive information.
* **Error Handling:** Implement robust error handling throughout the application. Log errors, handle exceptions gracefully, and provide informative error messages to the client.
* **Asynchronous Operations:** Use `async/await` to handle asynchronous operations (API calls, file I/O) properly.
* **Security:** Sanitize user inputs to prevent security vulnerabilities such as cross-site scripting (XSS) and SQL injection. Implement authentication and authorization if you need to restrict access to your API endpoints.
* **Scalability:** For a production environment, consider running the app behind a reverse proxy (e.g., Nginx) with load balancing to handle a large number of requests.
* **File Storage:** For production, store uploaded images in a cloud storage service (e.g., Amazon S3, Google Cloud Storage, Azure Blob Storage) instead of the local file system. This improves scalability, reliability, and security; a hedged S3 sketch follows this list.
* **Real Voice Input:** Implementing real voice input requires client-side JavaScript to capture audio with the browser's microphone, encode it (e.g., with the Opus or MP3 codec), and send it to the server. The built-in MediaRecorder API or libraries like Recorder.js can help; see the browser-side sketch after this list.
* **Rate Limiting:** Implement rate limiting to prevent abuse of your API endpoints (a minimal sketch follows this list).
* **Data Validation:** Thoroughly validate user inputs to ensure they are in the correct format and within expected ranges. This helps prevent errors and security vulnerabilities.
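For cloud file storage, here is a minimal sketch assuming the AWS SDK v3 (`npm install @aws-sdk/client-s3`); the bucket name and region are placeholders:
```javascript
// Hedged sketch: upload an image to S3 instead of keeping it on local disk.
const fs = require('fs');
const { S3Client, PutObjectCommand } = require('@aws-sdk/client-s3');

const s3 = new S3Client({ region: 'us-east-1' }); // Placeholder region

async function uploadToS3(localPath, key) {
  await s3.send(new PutObjectCommand({
    Bucket: 'my-bot-uploads',             // Placeholder bucket name
    Key: key,                             // e.g., 'uploads/image-123.jpg'
    Body: fs.createReadStream(localPath), // Stream the local file to S3
  }));
  return `s3://my-bot-uploads/${key}`;
}
```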
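For real voice capture in the browser, here is a hedged sketch using the standard MediaRecorder API; it posts a Base64 data URL in the shape the `/voice` route above expects, though codec support varies by browser:
```javascript
// Browser-side sketch: record a short clip and send it to /voice as a Base64 data URL.
async function recordAndSend(durationMs = 3000) {
  const stream = await navigator.mediaDevices.getUserMedia({ audio: true });
  const recorder = new MediaRecorder(stream);
  const chunks = [];
  recorder.ondataavailable = (e) => chunks.push(e.data);
  recorder.onstop = () => {
    const blob = new Blob(chunks, { type: recorder.mimeType });
    const reader = new FileReader();
    reader.onloadend = async () => {
      // reader.result is a data:audio/...;base64,... string, matching what /voice expects
      const res = await fetch('/voice', {
        method: 'POST',
        headers: { 'Content-Type': 'application/json' },
        body: JSON.stringify({ voiceData: reader.result }),
      });
      console.log(await res.json());
    };
    reader.readAsDataURL(blob);
  };
  recorder.start();
  // Stop recording and release the microphone after the requested duration
  setTimeout(() => {
    recorder.stop();
    stream.getTracks().forEach((t) => t.stop());
  }, durationMs);
}
```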
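For rate limiting, here is a minimal sketch assuming the `express-rate-limit` package (`npm install express-rate-limit`); the limits shown are arbitrary examples:
```javascript
// Hedged sketch: basic per-IP rate limiting applied to all endpoints.
const rateLimit = require('express-rate-limit');

app.use(rateLimit({
  windowMs: 60 * 1000, // 1-minute window
  max: 30,             // limit each IP to 30 requests per window (example value)
  message: { error: 'Too many requests, please try again later.' },
}));
```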
This example provides a solid foundation for building a multimodal bot with Node.js. Remember to replace the remaining placeholders with real API integrations and address the security and scalability considerations before deploying your application. A vision-capable OpenAI model handles the image analysis in this project.