AR tutor that uses vision-language AI to annotate real-world objects (Swift)

```swift
import SwiftUI
import ARKit
import RealityKit
import Vision
import Combine

// MARK: - AR View Configuration

struct ARAnnotationView: View {

    @ObservedObject var arViewModel: ARViewModel

    var body: some View {
        ZStack(alignment: .bottom) {
            ARViewContainer(arViewModel: arViewModel) // Embed the AR view

            // UI elements for displaying object detections and potential instructions
            VStack {
                if let detectedObject = arViewModel.detectedObject {
                    Text("Detected: \(detectedObject)")
                        .padding()
                        .background(Color.green.opacity(0.7))
                        .foregroundColor(.white)
                        .cornerRadius(10)
                }

                if let instruction = arViewModel.currentInstruction {
                    Text("Instruction: \(instruction)")
                        .padding()
                        .background(Color.blue.opacity(0.7))
                        .foregroundColor(.white)
                        .cornerRadius(10)
                }
            }
            .padding() // Add padding around the UI elements at the bottom
        }
        .ignoresSafeArea() // Make the AR view full screen
    }
}

// MARK: - AR View Container (UIViewRepresentable)

struct ARViewContainer: UIViewRepresentable {
    @ObservedObject var arViewModel: ARViewModel

    func makeUIView(context: Context) -> ARView {
        arViewModel.arView = ARView(frame: .zero) // Initialize the AR view within the view model
        arViewModel.configureARSession() // Configure the AR session
        context.coordinator.arView = arViewModel.arView // Give the coordinator access to arView
        return arViewModel.arView
    }

    func updateUIView(_ uiView: ARView, context: Context) {}

    func makeCoordinator() -> Coordinator {
        Coordinator(arViewModel: arViewModel)
    }

    class Coordinator: NSObject {
        var arViewModel: ARViewModel
        weak var arView: ARView? // Allows access to the ARView for gesture recognizers, etc.

        init(arViewModel: ARViewModel) {
            self.arViewModel = arViewModel
        }
    }
}

// MARK: - AR View Model (ObservableObject)

// NSObject inheritance is required so ARViewModel can act as the ARSessionDelegate (see extension below).
class ARViewModel: NSObject, ObservableObject {
    var arView: ARView!  // Created in makeUIView before the session is configured
    @Published var detectedObject: String? = nil
    @Published var currentInstruction: String? = nil
    private let visionQueue = DispatchQueue(label: "vision.queue")
    private var cancellables = Set<AnyCancellable>()

    // Load the Core ML model once; creating a VNCoreMLModel on every frame is wasteful.
    // The MobileNetV2.mlmodel file must be added to the app target for the generated class to exist.
    private lazy var visionModel: VNCoreMLModel? = try? VNCoreMLModel(for: MobileNetV2().model)

    func configureARSession() {
        let configuration = ARWorldTrackingConfiguration()
        configuration.planeDetection = [.horizontal, .vertical] // Detect horizontal and vertical planes

        arView.session.delegate = self // Set the ARSession delegate
        arView.session.run(configuration)

        arView.addCoaching() // Add ARKit coaching overlays
        startObjectDetection()
    }

    // Subscribe to RealityKit scene updates and classify the current camera frame.
    // `[weak self]` avoids a retain cycle between the view model and its stored subscription.
    func startObjectDetection() {
        arView.scene.subscribe(to: SceneEvents.Update.self) { [weak self] _ in
            guard let self = self,
                  let currentFrame = self.arView.session.currentFrame else { return }

            // Note: this runs on every rendered frame; throttle it in a real app.
            self.performObjectDetection(pixelBuffer: currentFrame.capturedImage)
        }.store(in: &cancellables)
    }

    // Vision classification on a background queue
    private func performObjectDetection(pixelBuffer: CVPixelBuffer) {
        visionQueue.async { [weak self] in
            guard let self = self, let model = self.visionModel else {
                print("Vision ML model is unavailable")
                return
            }

            let request = VNCoreMLRequest(model: model) { request, _ in
                guard let results = request.results as? [VNClassificationObservation],
                      let topResult = results.first else {
                    return
                }

                // Publish results on the main thread so SwiftUI updates safely.
                DispatchQueue.main.async {
                    self.detectedObject = topResult.identifier
                    self.provideInstruction(for: topResult.identifier)
                }
            }

            let handler = VNImageRequestHandler(cvPixelBuffer: pixelBuffer, options: [:])
            do {
                try handler.perform([request])
            } catch {
                print("Error performing object detection: \(error)")
            }
        }
    }


    // Dummy function to provide instructions based on detected object (replace with actual logic)
    func provideInstruction(for object: String) {
        switch object {
        case "keyboard":
            currentInstruction = "Type your password."
        case "cup":
            currentInstruction = "Take a sip."
        case "cell phone":
            currentInstruction = "Check your messages."
        default:
            currentInstruction = "No instruction available."
        }
    }

    deinit {
        arView?.session.pause() // Pause the AR session if it was ever started
        cancellables.forEach { $0.cancel() } // Cancel the Combine subscriptions
    }
}

// MARK: - ARKit Coaching

extension ARView {
    func addCoaching() {
        let coachingOverlay = ARCoachingOverlayView()
        coachingOverlay.frame = self.bounds
        coachingOverlay.autoresizingMask = [.flexibleWidth, .flexibleHeight]
        coachingOverlay.session = self.session
        coachingOverlay.goal = .anyPlane // Coach until a plane is found (matches the plane detection above)
        self.addSubview(coachingOverlay)
    }
}

// MARK: - ARSessionDelegate Extension

extension ARViewModel: ARSessionDelegate {
    func session(_ session: ARSession, didFailWithError error: Error) {
        print("AR Session Failed: \(error.localizedDescription)")
        //Handle AR session failures.
    }

    func sessionWasInterrupted(_ session: ARSession) {
        print("AR Session Interrupted")
        //Handle session interruptions.
    }

    func sessionInterruptionEnded(_ session: ARSession) {
        print("AR Session Interruption Ended")
        //Handle session interruption end.  Optionally reset tracking.
    }
}


// MARK: - SwiftUI App

@main
struct ARAnnotationApp: App {
    // Keep a single view model alive for the life of the scene.
    @StateObject private var arViewModel = ARViewModel()

    var body: some Scene {
        WindowGroup {
            ARAnnotationView(arViewModel: arViewModel)
        }
    }
}
```

Key improvements and explanations:

* **Clear Separation of Concerns:** Uses the MVVM (Model-View-ViewModel) pattern to keep the AR/session logic in the view model and the UI in SwiftUI views, which makes the code more maintainable and testable.
* **ARKit and RealityKit Integration:**  Combines ARKit (for tracking) and RealityKit (for rendering, though minimal in this example) in a proper SwiftUI way.
* **Vision Framework Integration:**  Demonstrates how to use the Vision framework to perform object detection on the live camera feed.
* **Error Handling:** Includes basic error handling for the AR session and Vision requests. The Core ML model is loaded once with `try?`, and a failed load is logged rather than crashing the app.
* **AR Coaching:** Implements ARKit coaching to guide the user to a better AR experience.
* **Asynchronous Operations:** Uses `DispatchQueue.async` to perform object detection on a background thread, preventing the UI from blocking.  The main thread is used only for updating the UI.
* **Cancellable Subscriptions:** Stores the `SceneEvents.Update` subscription in a `Set<AnyCancellable>` and captures `self` weakly in the update closure, so the view model can be deallocated and the subscription torn down without leaking.
* **Clearer Object Detection:**  The object detection logic is now more robust, handling potential errors and displaying the detected object.
* **Instruction Display:**  A simple instruction display based on the detected object is added.
* **`ARView` Handling**: The `ARView` is created in `makeUIView`, stored on the `ARViewModel`, and its session is configured there, with the `UIViewRepresentable` wrapper bridging the UIKit-based `ARView` into SwiftUI.
* **Coordinator:** The `Coordinator` keeps a weak reference to the `ARView` so UIKit-side callbacks (gesture recognizers, delegates) can reach it outside SwiftUI's update cycle; see the tap-to-annotate sketch after this list.
* **SwiftUI Lifecycle Management:** Pauses the AR session and cancels subscriptions in the `deinit` method of the `ARViewModel` to prevent memory leaks.
* **ObservableObject Conformance:** The `ARViewModel` is an `ObservableObject` so that changes to the `@Published` properties (e.g., `detectedObject`, `currentInstruction`) will automatically trigger UI updates.
* **Main-Thread UI Updates**: Vision results are dispatched back to the main queue with `DispatchQueue.main.async` before the `@Published` properties change, so SwiftUI only sees updates on the main thread.
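
For the Coordinator point above, here is a hedged sketch of how it could be extended so a tap actually drops an annotation into the 3D scene rather than only updating the 2D overlay. Nothing below is part of the listing above: the handler name, label styling, and confidence in the latest classification are all illustrative, and the gesture recognizer would need to be registered on the `ARView` in `makeUIView`.

```swift
// Hypothetical extension of the Coordinator defined above: tap to place a floating
// text label at the touched point, using a raycast against detected planes.
// Register the recognizer in makeUIView, for example:
//   let tap = UITapGestureRecognizer(target: context.coordinator,
//                                    action: #selector(Coordinator.handleTap(_:)))
//   arViewModel.arView.addGestureRecognizer(tap)
extension ARViewContainer.Coordinator {

    @objc func handleTap(_ sender: UITapGestureRecognizer) {
        guard let arView = arView else { return }
        let point = sender.location(in: arView)

        // Raycast from the touch point onto an existing or estimated plane.
        guard let result = arView.raycast(from: point,
                                          allowing: .estimatedPlane,
                                          alignment: .any).first else { return }

        // Use the latest classification (if any) as the annotation text.
        let label = arViewModel.detectedObject ?? "Unknown object"

        let textMesh = MeshResource.generateText(label,
                                                 extrusionDepth: 0.002,
                                                 font: .systemFont(ofSize: 0.04),
                                                 containerFrame: .zero,
                                                 alignment: .center,
                                                 lineBreakMode: .byWordWrapping)
        let textEntity = ModelEntity(mesh: textMesh,
                                     materials: [SimpleMaterial(color: .yellow, isMetallic: false)])

        // Anchor the label where the raycast hit the real world.
        let anchor = AnchorEntity(world: result.worldTransform)
        anchor.addChild(textEntity)
        arView.scene.addAnchor(anchor)
    }
}
```

The raycast snaps the label to the nearest detected (or estimated) plane under the finger; a production app would also orient the text toward the camera and reuse anchors instead of adding a new one per tap.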

How to Run:

1.  **Hardware Requirements:**  Requires an iOS device with ARKit support (an iPhone or iPad with an A9 chip or later) running iOS 14 or later, since the SwiftUI `App` life cycle used here needs iOS 14.
2.  **Xcode:**  Open the project in Xcode.
3.  **Signing & Permissions:**  In your project settings in Xcode:
    *   Select your target and choose your development team under "Signing & Capabilities".
    *   Add the `NSCameraUsageDescription` key ("Privacy - Camera Usage Description") to the target's Info.plist with a short explanation of why the camera is needed. This is crucial; ARKit cannot access the camera without it.
    *   Add the MobileNetV2 Core ML model (available from Apple's Core ML model gallery) to the target so the generated `MobileNetV2` class the code references exists.
4.  **Run on Device:** Connect your iOS device, select it as the run destination in Xcode, and run the app; ARKit does not work in the Simulator. The AR view will appear on your device.

Important Notes:

*   **Model Accuracy:** MobileNetV2 is a general-purpose image *classification* model: it labels the whole frame rather than localizing individual objects, and it only knows the classes it was trained on. For true object detection, consider a dedicated detection model (e.g., a YOLO or SSD variant) converted to Core ML; see the detection sketch after this list. Pre-trained Core ML models are available on Apple's developer site, or you can train your own with Create ML or TensorFlow/PyTorch and convert them to Core ML format.
*   **Performance:** Running Vision on every frame is computationally expensive. Profile the app and, if needed, throttle the request rate (see the throttling sketch after this list), reduce the input image size, or use a smaller model.
*   **Vision Framework Limitations:** The Vision framework provides a variety of image analysis capabilities, including object detection, face detection, and text recognition. However, it has limitations in terms of accuracy and performance, especially in challenging environments. Consider exploring other frameworks or libraries if you require more advanced or specialized image analysis capabilities.
*   **Real-World Testing:** Thoroughly test the app in different real-world scenarios to ensure its accuracy and robustness.  Pay attention to lighting conditions, object occlusion, and device movement.  The dummy instructions should be replaced with meaningful instructions based on the identified object and the application's purpose.
*   **Privacy:**  Be mindful of user privacy when using the camera.  Clearly communicate to the user how their camera data is being used and obtain their consent if necessary.
*   **Model Integration:** Replace the general-purpose MobileNetV2 placeholder with a Core ML model trained for the objects your tutor actually needs to recognize.
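
For the model-accuracy point above: if you swap in a real detection model, Vision returns `VNRecognizedObjectObservation` values (bounding box plus ranked labels) instead of `VNClassificationObservation`. Below is a minimal sketch of building such a request; the wrapped model is whatever detection network you add to the project, and the 0.5 confidence threshold and `print` are placeholders.

```swift
import Vision

// Sketch only: a request for an object detection model instead of a classifier.
// `model` is a VNCoreMLModel wrapping whichever detection model you add to the target.
func makeDetectionRequest(model: VNCoreMLModel) -> VNCoreMLRequest {
    let request = VNCoreMLRequest(model: model) { request, _ in
        guard let observations = request.results as? [VNRecognizedObjectObservation] else { return }

        // Each observation carries a normalized bounding box plus ranked labels.
        for observation in observations where observation.confidence > 0.5 {
            guard let bestLabel = observation.labels.first else { continue }
            let box = observation.boundingBox // normalized (0...1) image coordinates
            print("Saw \(bestLabel.identifier) (confidence \(bestLabel.confidence)) at \(box)")
            // From here you could raycast through the box center to anchor a 3D annotation.
        }
    }
    // Crop/scale behaviour matters for detection models; .scaleFill is a common choice.
    request.imageCropAndScaleOption = .scaleFill
    return request
}
```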
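
And for the performance point, the simplest fix is to rate-limit the scene-update subscription instead of classifying every frame. A sketch of a throttled variant of `startObjectDetection`, meant to live in the same file as `ARViewModel` so it can reach the private members; the half-second interval is an assumed value to tune per model and device.

```swift
// Sketch: a throttled replacement for startObjectDetection (same file as ARViewModel,
// so the private performObjectDetection and cancellables are visible).
extension ARViewModel {
    func startThrottledObjectDetection(minimumInterval: TimeInterval = 0.5) {
        var lastRunTimestamp: TimeInterval = 0

        arView.scene.subscribe(to: SceneEvents.Update.self) { [weak self] _ in
            guard let self = self,
                  let frame = self.arView.session.currentFrame,
                  frame.timestamp - lastRunTimestamp >= minimumInterval else { return }

            lastRunTimestamp = frame.timestamp
            self.performObjectDetection(pixelBuffer: frame.capturedImage)
        }.store(in: &cancellables)
    }
}
```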

This comprehensive example provides a solid foundation for building an AR tutor app using Swift, ARKit, RealityKit, and Vision. Remember to tailor the code to your specific requirements and test it thoroughly in real-world scenarios.