Adds a LlmInference.Metrics for providing some key performance metrics (initialization time, response generation time) of the LLM inference.

PiperOrigin-RevId: 683258108
yishuangP authored and copybara-github committed Oct 7, 2024
1 parent 876bdd5 commit eafcf31
Showing 2 changed files with 74 additions and 6 deletions.
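
For context, a minimal sketch of how a caller might read the new session metric added by this change. It assumes the existing `LlmInference.Options(modelPath:)`, `Session(llmInference:)`, `addQueryChunk(inputText:)`, and `generateResponse()` APIs; the model path is purely illustrative and error handling is elided.

import MediaPipeTasksGenAI

// Hypothetical model path, for illustration only.
let llmInference = try LlmInference(
  options: LlmInference.Options(modelPath: "/path/to/model.task"))

let session = try LlmInference.Session(llmInference: llmInference)
try session.addQueryChunk(inputText: "Tell me a story.")
let response = try session.generateResponse()

// New in this change: time taken to generate the full response, in milliseconds.
print("Generated in \(session.metrics.responseGenerationTimeInMillis) ms: \(response)")
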
53 changes: 47 additions & 6 deletions mediapipe/tasks/ios/genai/inference/sources/LlmInference+Session.swift
@@ -27,12 +27,19 @@ extension LlmInference {
///
/// Note: Inherits from `NSObject` for Objective-C interoperability.
@objc(MPPLLMInferenceSession) public final class Session: NSObject {
// Session runner that manages the creation, deletion and execution of the underlying C session.
/// Provides key metrics including response generation time.
public private(set) var metrics: Metrics

/// Session runner that manages the creation, deletion and execution of the underlying C
/// session.
private let llmSessionRunner: LlmSessionRunner

// LLM Inference used to create this session.
/// LLM Inference used to create this session.
private let llmInference: LlmInference

/// The start time of the current response generation.
private var responseGenerationStartTime = TimeInterval.zero

/// Creates a new instance of `LlmInference.Session` with the given options and `llmInference`.
/// Note: This class maintains a strong reference to `llmInference`. `llmInference` will
/// only get deallocated after all sessions created using the `llmInference` get destroyed.
@@ -60,6 +67,7 @@ extension LlmInference {
} ?? llmInference.createSessionRunner(sessionConfig: sessionConfig)

self.llmInference = llmInference
self.metrics = Metrics(responseGenerationTimeInMillis: 0)
super.init()
}

@@ -89,6 +97,7 @@ extension LlmInference {
init(llmSessionRunner: LlmSessionRunner, llmInference: LlmInference) {
self.llmSessionRunner = llmSessionRunner
self.llmInference = llmInference
self.metrics = Metrics(responseGenerationTimeInMillis: 0)
super.init()
}

@@ -117,10 +126,10 @@ extension LlmInference {
///
/// TODO: If simultaneous response generations on multiple sessions or the same session
/// are allowed to happen it leads to a crash. Investigate if this can be handled by C++.
try llmInference.shouldContinueWithResponseGeneration()
try shouldContinueWithResponseGeneration()

defer {
llmInference.markResponseGenerationCompleted()
markResponseGenerationCompleted()
}

let tokens = try llmSessionRunner.predict()
@@ -156,7 +165,7 @@ extension LlmInference {
///
/// TODO: If simultaneous response generations on multiple sessions or the same session
/// are allowed to happen it leads to a crash. Investigate if this can be handled by C++.
try llmInference.shouldContinueWithResponseGeneration()
try shouldContinueWithResponseGeneration()

/// Used to make a decision about whitespace stripping.
var receivedFirstToken = true
Expand All @@ -177,7 +186,7 @@ extension LlmInference {
progress(humanReadableLlmResponse, nil)
},
completion: { [weak self] in
self?.llmInference.markResponseGenerationCompleted()
self?.markResponseGenerationCompleted()
completion()
})
}
@@ -238,6 +247,20 @@ extension LlmInference {
return Session(llmSessionRunner: clonedSessionRunner, llmInference: self.llmInference)
}

/// Checks whether a new response generation can start and records its start time.
private func shouldContinueWithResponseGeneration() throws {
try llmInference.shouldContinueWithResponseGeneration()
responseGenerationStartTime = TimeInterval(clock_gettime_nsec_np(CLOCK_MONOTONIC_RAW))
}

/// Marks the response generation as completed and updates the metrics.
private func markResponseGenerationCompleted() {
llmInference.markResponseGenerationCompleted()
metrics = Metrics(
responseGenerationTimeInMillis: (TimeInterval(clock_gettime_nsec_np(CLOCK_MONOTONIC_RAW))
- responseGenerationStartTime) * 1000
/ TimeInterval(NSEC_PER_SEC))
}

private static func humanReadableString(
llmResponses: [String], stripLeadingWhitespaces: Bool = true
) -> String? {
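
For reference, the `markResponseGenerationCompleted` implementation above measures elapsed time with the raw monotonic clock and converts nanoseconds to milliseconds. The same pattern in isolation, as a standalone sketch (the helper name is illustrative, not part of the MediaPipe API):

import Foundation

/// Illustrative helper: times a closure with the raw monotonic clock and
/// returns the elapsed time in milliseconds.
func elapsedMillis(of work: () -> Void) -> TimeInterval {
  let startNanos = clock_gettime_nsec_np(CLOCK_MONOTONIC_RAW)
  work()
  let endNanos = clock_gettime_nsec_np(CLOCK_MONOTONIC_RAW)
  // Nanoseconds to milliseconds: multiply by 1000, divide by NSEC_PER_SEC (1e9).
  return TimeInterval(endNanos - startNanos) * 1000 / TimeInterval(NSEC_PER_SEC)
}
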
@@ -280,6 +303,24 @@ extension LlmInference.Session {
}
}

/// Extension to `LlmInference.Session` for defining `LlmInference.Session.Metrics`
extension LlmInference.Session {
/// Provides some key metrics for the `LlmInference.Session`.
///
/// Note: Inherits from `NSObject` for Objective-C interoperability.
@objc(MPPLLMInferenceSessionMetrics) public final class Metrics: NSObject {

/// The time it took to generate the full response for the last query, in milliseconds.
@objc public private(set) var responseGenerationTimeInMillis: TimeInterval

@objc public init(
responseGenerationTimeInMillis: TimeInterval
) {
self.responseGenerationTimeInMillis = responseGenerationTimeInMillis
}
}
}

/// An extension to `String` to add some utility functions.
extension String {
private static let tokenSplitter = "▁"
27 changes: 27 additions & 0 deletions mediapipe/tasks/ios/genai/inference/sources/LlmInference.swift
@@ -22,12 +22,18 @@ import MediaPipeTasksGenAIC
/// out of scope if at least one of its sessions outlives its scope.
///
/// Note: Inherits from `NSObject` for Objective-C interoperability.
///
/// Note: Initializing an LLM inference engine is an expensive operation. Avoid initializing it on
/// the main thread.
@objc(MPPLLMInference) public final class LlmInference: NSObject {
private static let numberOfDecodeStepsPerSync = 3
private static let sequenceBatchSize = 0
private static let responseGenerationInProgressQueueName =
"com.google.mediapipe.genai.isResponseGenerationInProgressQueue"

/// Provides key metrics including initialization duration.
public private(set) var metrics: Metrics

private let llmTaskRunner: LlmTaskRunner

/// Serial queue that reads and updates `responseGenerationInProgress` to restrict simultaneous
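
Per the note above about initialization being expensive, a hedged sketch of constructing the engine off the main thread; the queue choice and model path are illustrative, not prescribed by the API.

import Foundation
import MediaPipeTasksGenAI

// Sketch: initialize the engine on a background queue, then hand it back
// to the main thread for use.
DispatchQueue.global(qos: .userInitiated).async {
  do {
    let options = LlmInference.Options(modelPath: "/path/to/model.task")  // hypothetical path
    let llmInference = try LlmInference(options: options)
    DispatchQueue.main.async {
      // Safe to hold and use `llmInference` from the main thread now.
      print("Engine ready in \(llmInference.metrics.initializationTimeInMillis) ms")
    }
  } catch {
    print("LlmInference initialization failed: \(error)")
  }
}
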
Expand All @@ -53,6 +59,7 @@ import MediaPipeTasksGenAIC

let sequenceBatchSize = LlmInference.sequenceBatchSize
let numberOfDecodeStepsPerSync = LlmInference.numberOfDecodeStepsPerSync
let timeBeforeInit = clock_gettime_nsec_np(CLOCK_MONOTONIC_RAW)
llmTaskRunner = try options.modelPath.withCString { modelPath in
try cacheDirectory.withCString { cacheDirectory in
try options.supportedLoraRanks.withUnsafeMutableBufferPointer { supportedLoraRanks in
Expand All @@ -72,6 +79,10 @@ import MediaPipeTasksGenAIC
}
}
}
let timeAfterInit = clock_gettime_nsec_np(CLOCK_MONOTONIC_RAW)
metrics = Metrics(
initializationTimeInMillis: (TimeInterval(timeAfterInit - timeBeforeInit) * 1000)
/ TimeInterval(NSEC_PER_SEC))

super.init()
}
@@ -270,3 +281,19 @@ extension LlmInference.ActivationDataType {
}
}
}

extension LlmInference {
/// Provides some key metrics for the `LlmInference`.
///
/// Note: Inherits from `NSObject` for Objective-C interoperability.
@objc(MPPLLMInferenceMetrics) public final class Metrics: NSObject {
/// The time it took to initialize the LLM inference engine, in milliseconds.
/// If you want to include the time it took to load the model weights, set
/// `LlmInference.Options.waitForWeightUploads` to true.
@objc public private(set) var initializationTimeInMillis: TimeInterval

@objc public init(initializationTimeInMillis: TimeInterval) {
self.initializationTimeInMillis = initializationTimeInMillis
}
}
}
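
As the doc comment on `initializationTimeInMillis` notes, weight-loading time is only included when `waitForWeightUploads` is set. A brief sketch (model path illustrative, error handling elided):

import MediaPipeTasksGenAI

let options = LlmInference.Options(modelPath: "/path/to/model.task")  // hypothetical path
// Include the time spent uploading model weights in the reported
// initialization time.
options.waitForWeightUploads = true

let llmInference = try LlmInference(options: options)
print("Initialization took \(llmInference.metrics.initializationTimeInMillis) ms")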
