Adds a LlmInference.Metrics for providing some key performance metrics (initialization time, response generation time) of the LLM inference.

PiperOrigin-RevId: 683258108
yishuangP authored and copybara-github committed Oct 7, 2024
1 parent 876bdd5 commit eafcf31
Showing 2 changed files with 74 additions and 6 deletions.
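
For context, a minimal sketch of how a caller might read the new session metric added by this change. It assumes the existing `LlmInference.Options(modelPath:)`, `Session(llmInference:)`, `addQueryChunk(inputText:)`, and `generateResponse()` APIs; the model path is purely illustrative and error handling is elided.

import MediaPipeTasksGenAI

// Hypothetical model path, for illustration only.
let llmInference = try LlmInference(
  options: LlmInference.Options(modelPath: "/path/to/model.task"))

let session = try LlmInference.Session(llmInference: llmInference)
try session.addQueryChunk(inputText: "Tell me a story.")
let response = try session.generateResponse()

// New in this change: time taken to generate the full response, in milliseconds.
print("Generated in \(session.metrics.responseGenerationTimeInMillis) ms: \(response)")
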
53 changes: 47 additions & 6 deletions mediapipe/tasks/ios/genai/inference/sources/LlmInference+Session.swift
@@ -27,12 +27,19 @@ extension LlmInference {
///
/// Note: Inherits from `NSObject` for Objective-C interoperability.
@objc(MPPLLMInferenceSession) public final class Session: NSObject {
// Session runner that manages the creation, deletion and execution of the underlying C session.
/// Provides key metrics including response generation time.
public private(set) var metrics: Metrics

/// Session runner that manages the creation, deletion and execution of the underlying C
/// session.
private let llmSessionRunner: LlmSessionRunner

// LLM Inference used to create this session.
/// LLM Inference used to create this session.
private let llmInference: LlmInference

/// The start time of the current response generation.
private var responseGenerationStartTime = TimeInterval.zero

/// Creates a new instance of `LlmInference.Session` with the given options and `llmInference`.
/// Note: This class maintains a strong reference to `llmInference`. `llmInference` will
/// only get deallocated after all sessions created using the `llmInference` get destroyed.
@@ -60,6 +67,7 @@ extension LlmInference {
} ?? llmInference.createSessionRunner(sessionConfig: sessionConfig)

self.llmInference = llmInference
self.metrics = Metrics(responseGenerationTimeInMillis: 0)
super.init()
}

@@ -89,6 +97,7 @@ extension LlmInference {
init(llmSessionRunner: LlmSessionRunner, llmInference: LlmInference) {
self.llmSessionRunner = llmSessionRunner
self.llmInference = llmInference
self.metrics = Metrics(responseGenerationTimeInMillis: 0)
super.init()
}

@@ -117,10 +126,10 @@ extension LlmInference {
///
/// TODO: If simultaneous response generations on multiple sessions or the same session
/// are allowed to happen it leads to a crash. Investigate if this can be handled by C++.
try llmInference.shouldContinueWithResponseGeneration()
try shouldContinueWithResponseGeneration()

defer {
llmInference.markResponseGenerationCompleted()
markResponseGenerationCompleted()
}

let tokens = try llmSessionRunner.predict()
@@ -156,7 +165,7 @@ extension LlmInference {
///
/// TODO: If simultaneous response generations on multiple sessions or the same session
/// are allowed to happen it leads to a crash. Investigate if this can be handled by C++.
try llmInference.shouldContinueWithResponseGeneration()
try shouldContinueWithResponseGeneration()

/// Used to make a decision about whitespace stripping.
var receivedFirstToken = true
Expand All @@ -177,7 +186,7 @@ extension LlmInference {
progress(humanReadableLlmResponse, nil)
},
completion: { [weak self] in
self?.llmInference.markResponseGenerationCompleted()
self?.markResponseGenerationCompleted()
completion()
})
}
@@ -238,6 +247,20 @@ extension LlmInference {
return Session(llmSessionRunner: clonedSessionRunner, llmInference: self.llmInference)
}

/// Checks whether a new response generation can start and records its start time.
private func shouldContinueWithResponseGeneration() throws {
try llmInference.shouldContinueWithResponseGeneration()
responseGenerationStartTime = TimeInterval(clock_gettime_nsec_np(CLOCK_MONOTONIC_RAW))
}

/// Marks the response generation as completed and updates the metrics.
private func markResponseGenerationCompleted() {
llmInference.markResponseGenerationCompleted()
metrics = Metrics(
responseGenerationTimeInMillis: (TimeInterval(clock_gettime_nsec_np(CLOCK_MONOTONIC_RAW))
- responseGenerationStartTime) * 1000
/ TimeInterval(NSEC_PER_SEC))
}

private static func humanReadableString(
llmResponses: [String], stripLeadingWhitespaces: Bool = true
) -> String? {
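
For reference, the `markResponseGenerationCompleted` implementation above measures elapsed time with the raw monotonic clock and converts nanoseconds to milliseconds. The same pattern in isolation, as a standalone sketch (the helper name is illustrative, not part of the MediaPipe API):

import Foundation

/// Illustrative helper: times a closure with the raw monotonic clock and
/// returns the elapsed time in milliseconds.
func elapsedMillis(of work: () -> Void) -> TimeInterval {
  let startNanos = clock_gettime_nsec_np(CLOCK_MONOTONIC_RAW)
  work()
  let endNanos = clock_gettime_nsec_np(CLOCK_MONOTONIC_RAW)
  // Nanoseconds to milliseconds: multiply by 1000, divide by NSEC_PER_SEC (1e9).
  return TimeInterval(endNanos - startNanos) * 1000 / TimeInterval(NSEC_PER_SEC)
}
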
@@ -280,6 +303,24 @@ extension LlmInference.Session {
}
}

/// Extension to `LlmInference.Session` for defining `LlmInference.Session.Metrics`
extension LlmInference.Session {
/// Provides some key metrics for the `LlmInference.Session`.
///
/// Note: Inherits from `NSObject` for Objective-C interoperability.
@objc(MPPLLMInferenceSessionMetrics) public final class Metrics: NSObject {

/// The time it took to generate the full response for the last query, in milliseconds.
@objc public private(set) var responseGenerationTimeInMillis: TimeInterval

@objc public init(
responseGenerationTimeInMillis: TimeInterval
) {
self.responseGenerationTimeInMillis = responseGenerationTimeInMillis
}
}
}

/// An extension to `String` to add some utility functions.
extension String {
private static let tokenSplitter = "▁"
27 changes: 27 additions & 0 deletions mediapipe/tasks/ios/genai/inference/sources/LlmInference.swift
@@ -22,12 +22,18 @@ import MediaPipeTasksGenAIC
/// out of scope if at least one of its sessions outlives its scope.
///
/// Note: Inherits from `NSObject` for Objective-C interoperability.
///
/// Note: Initializing an LLM inference engine is an expensive operation. Avoid initializing it on
/// the main thread.
@objc(MPPLLMInference) public final class LlmInference: NSObject {
private static let numberOfDecodeStepsPerSync = 3
private static let sequenceBatchSize = 0
private static let responseGenerationInProgressQueueName =
"com.google.mediapipe.genai.isResponseGenerationInProgressQueue"

/// Provides key metrics including initialization duration.
public private(set) var metrics: Metrics

private let llmTaskRunner: LlmTaskRunner

/// Serial queue that reads and updates `responseGenerationInProgress` to restrict simultaneous
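
Per the note above about initialization being expensive, a hedged sketch of constructing the engine off the main thread; the queue choice and model path are illustrative, not prescribed by the API.

import Foundation
import MediaPipeTasksGenAI

// Sketch: initialize the engine on a background queue, then hand it back
// to the main thread for use.
DispatchQueue.global(qos: .userInitiated).async {
  do {
    let options = LlmInference.Options(modelPath: "/path/to/model.task")  // hypothetical path
    let llmInference = try LlmInference(options: options)
    DispatchQueue.main.async {
      // Safe to hold and use `llmInference` from the main thread now.
      print("Engine ready in \(llmInference.metrics.initializationTimeInMillis) ms")
    }
  } catch {
    print("LlmInference initialization failed: \(error)")
  }
}
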
Expand All @@ -53,6 +59,7 @@ import MediaPipeTasksGenAIC

let sequenceBatchSize = LlmInference.sequenceBatchSize
let numberOfDecodeStepsPerSync = LlmInference.numberOfDecodeStepsPerSync
let timeBeforeInit = clock_gettime_nsec_np(CLOCK_MONOTONIC_RAW)
llmTaskRunner = try options.modelPath.withCString { modelPath in
try cacheDirectory.withCString { cacheDirectory in
try options.supportedLoraRanks.withUnsafeMutableBufferPointer { supportedLoraRanks in
Expand All @@ -72,6 +79,10 @@ import MediaPipeTasksGenAIC
}
}
}
let timeAfterInit = clock_gettime_nsec_np(CLOCK_MONOTONIC_RAW)
metrics = Metrics(
initializationTimeInMillis: (TimeInterval(timeAfterInit - timeBeforeInit) * 1000)
/ TimeInterval(NSEC_PER_SEC))

super.init()
}
@@ -270,3 +281,19 @@ extension LlmInference.ActivationDataType {
}
}
}

extension LlmInference {
/// Provides some key metrics for the `LlmInference`.
///
/// Note: Inherits from `NSObject` for Objective-C interoperability.
@objc(MPPLLMInferenceMetrics) public final class Metrics: NSObject {
/// The time it took to initialize the LLM inference engine, in milliseconds.
/// If you want to include the time it took to load the model weights, set
/// `LlmInference.Options.waitForWeightUploads` to true.
@objc public private(set) var initializationTimeInMillis: TimeInterval

@objc public init(initializationTimeInMillis: TimeInterval) {
self.initializationTimeInMillis = initializationTimeInMillis
}
}
}
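
As the doc comment on `initializationTimeInMillis` notes, weight-loading time is only included when `waitForWeightUploads` is set. A brief sketch (model path illustrative, error handling elided):

import MediaPipeTasksGenAI

let options = LlmInference.Options(modelPath: "/path/to/model.task")  // hypothetical path
// Include the time spent uploading model weights in the reported
// initialization time.
options.waitForWeightUploads = true

let llmInference = try LlmInference(options: options)
print("Initialization took \(llmInference.metrics.initializationTimeInMillis) ms")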
