// webcodecs.idl

// WARNING: These are just some ideas of how it could look.  Don't take them too seriously yet.

// TODO(when writing spec):
// - Specify that encoding and decoding must happen off the main thread.

// Common definitions used for both audio and video.

// A media time expressed as an integer `value` paired with an integer `scale`.
// Both are supplied to the constructor and exposed as read-only attributes.
// NOTE(review): presumably the time in seconds is value / scale (i.e. `scale`
// is a timebase in units per second) — TODO confirm when writing the spec.
[Constructor(unsigned long long value, unsigned long long scale)]
interface MediaTime {
  readonly attribute unsigned long long value;
  readonly attribute unsigned long long scale;
};


// Audio encoder and decoder interfaces.

// Codec parameters shared by AudioDecoderInit and AudioEncoderInit, which
// both inherit from this dictionary.
dictionary AudioCodecParameters {
  // Names the codec to use.
  DOMString codec;

  // Defaults are codec-specific.
  unsigned long? sampleRate;
  unsigned long? channelCount;
};

// Audio decoder exposed as a pair of streams: AudioDecoderInput items are
// written to `writable`, AudioDecoderOutput items are read from `readable`.
// Configured once, at construction, from an AudioDecoderInit.
[Constructor(AudioDecoderInit init)]
interface AudioDecoder {
  readonly attribute WritableStream writable;  // of AudioDecoderInput
  readonly attribute ReadableStream readable;  // of AudioDecoderOutput
};

// Audio encoder exposed as a pair of streams: AudioEncoderInput items are
// written to `writable`, AudioEncoderOutput items are read from `readable`.
// Configured at construction from an AudioEncoderInit.
[Constructor(AudioEncoderInit init)]
interface AudioEncoder {
  readonly attribute WritableStream writable;  // of AudioEncoderInput
  readonly attribute ReadableStream readable;  // of AudioEncoderOutput
};

// Constructor argument for AudioDecoder: the shared audio codec parameters
// plus decoder-only initialization data.
dictionary AudioDecoderInit : AudioCodecParameters {
  // Optional byte data required to initialize audio decoders,
  // such as Vorbis codebooks.
  BufferSource? extraData;

  // Duration the decoder must decode before the decoded data is valid.
  MediaTime? seekPreRoll;

  // Duration the decoder should discard before returning decoded data.
  // Can include both decoder delay and padding added during encoding.
  MediaTime? codecDelay;
};

// One item written to AudioDecoder.writable.
dictionary AudioDecoderInput {
  // NOTE(review): presumably the encoded audio payload for this item — confirm.
  Uint8Array data;
  // Media time associated with `data`.
  MediaTime timestamp;
};

// One item read from AudioDecoder.readable; also the item type accepted by
// AudioTrackWriter.writable.
dictionary AudioDecoderOutput {
  // NOTE(review): presumably the decoded audio as a Web Audio AudioBuffer — confirm.
  AudioBuffer buffer;
  // TODO: decode stats.
};

// Audio encoder settings that are not part of AudioEncoderDynamicSettings.
// Currently has no members.
// NOTE(review): "static" presumably means fixed for the encoder's lifetime,
// as opposed to settings changeable per input — confirm.
dictionary AudioEncoderStaticSettings {
};

// Audio encoder settings that can also be supplied alongside an input
// (see AudioEncoderInput.changeSettings).
dictionary AudioEncoderDynamicSettings {
  // Not supported by all codecs.
  // null/unset means use the codec default.
  unsigned long? bitsPerSecond;

  // Codec-specific.
  // null/unset means use the codec default.
  unsigned long? complexity;

  // Probably Opus-specific. Web IDL spells the type `boolean` (there is no
  // `bool`); each flag defaults to false.
  boolean fec = false;  // enabled or not
  boolean dtx = false;  // enabled or not
  boolean cbr = false;  // cbr or not (vbr if not)
  boolean speechMode = false;  // speech-specific mode or not
};

// Complete set of audio encoder settings: the static and the dynamic members
// combined. Adds no members of its own.
// NOTE(review): Web IDL dictionaries may inherit from only ONE dictionary, so
// this two-parent form will not parse as-is. It is kept to show the intent;
// restructure (e.g. chain the parents or fold the members in) when writing
// the spec.
dictionary AudioEncoderSettings : AudioEncoderStaticSettings, AudioEncoderDynamicSettings {
};

// Constructor argument for AudioEncoder: the shared audio codec parameters
// plus the encoder settings.
dictionary AudioEncoderInit : AudioCodecParameters {
  // Optional: simply omit the member to leave it unset. A nullable dictionary
  // type is not permitted as a dictionary member in Web IDL, so this is
  // declared non-nullable, matching VideoEncoderInit.settings.
  AudioEncoderSettings settings;
};

// One item written to AudioEncoder.writable; also the item type produced by
// AudioTrackReader.readable.
dictionary AudioEncoderInput {
  // Media time associated with `buffer`.
  MediaTime timestamp;
  // NOTE(review): presumably the raw (unencoded) audio samples; the sample
  // layout is not specified here — confirm.
  ArrayBuffer buffer;
  // Dynamic settings supplied together with this input.
  // NOTE(review): presumably applied starting with this input — confirm.
  AudioEncoderDynamicSettings changeSettings;
};

// One item read from AudioEncoder.readable.
dictionary AudioEncoderOutput {
  // NOTE(review): presumably the encoded audio payload — confirm.
  Uint8Array data;
  // Media time associated with `data`.
  MediaTime timestamp;
  // TODO: encode stats.
};


// Video encoder and decoder interfaces.

// A single video frame: image data paired with a media timestamp.
// Both attributes are read-only and no constructor is declared.
interface VideoFrame {
  readonly attribute MediaTime timestamp;
  readonly attribute ImageData imageData;
};

// Codec parameters shared by VideoDecoderInit and VideoEncoderInit, which
// both inherit from this dictionary.
dictionary VideoCodecParameters {
  // Names the codec to use.
  DOMString codec;

  // For VP9:
  DOMString? profile;
};

// Video decoder exposed as a pair of streams: VideoDecoderInput items are
// written to `writable`, VideoDecoderOutput items are read from `readable`.
// Configured once, at construction, from a VideoDecoderInit.
[Constructor(VideoDecoderInit init)]
interface VideoDecoder {
  readonly attribute WritableStream writable;  // of VideoDecoderInput
  readonly attribute ReadableStream readable;  // of VideoDecoderOutput
};

// Video encoder exposed as a pair of streams: VideoEncoderInput items are
// written to `writable`, VideoEncoderOutput items are read from `readable`.
// Configured at construction from a VideoEncoderInit.
[Constructor(VideoEncoderInit init)]
interface VideoEncoder {
  readonly attribute WritableStream writable;  // of VideoEncoderInput
  readonly attribute ReadableStream readable;  // of VideoEncoderOutput
};

// Constructor argument for VideoDecoder: the shared video codec parameters
// plus decoder-only initialization data.
dictionary VideoDecoderInit : VideoCodecParameters {
  // Optional byte data required to initialize video decoders,
  // such as H264 with SPS and PPS.
  BufferSource? extraData;

  // Can be used to initialize the decoder faster
  // than waiting for the first frame.
  // Typed `unsigned long?` to agree with the same-named members of
  // VideoEncoderStaticSettings.
  unsigned long? expectedWidth;
  unsigned long? expectedHeight;
};

// One item written to VideoDecoder.writable.
dictionary VideoDecoderInput {
  // NOTE(review): presumably the encoded video payload for this item — confirm.
  Uint8Array data;
  // Media time associated with `data`.
  MediaTime timestamp;
};

// One item read from VideoDecoder.readable; also the item type accepted by
// VideoTrackWriter.writable.
dictionary VideoDecoderOutput {
  // The frame carried by this output item.
  VideoFrame frame;
  // TODO: add decode stats.
};

// Kind of content being encoded; used by
// VideoEncoderDynamicSettings.contentMode.
enum VideoEncodeContentMode {
  "screen",  // For screen sharing/recording
  "default"  // Everything else
};

// Describes one encoding layer; a list of these is supplied via
// VideoEncoderStaticSettings.layers.
dictionary VideoEncodeLayer {
  // Referenced in .dependsOn and VideoEncoderOutput.layerId
  DOMString id;

  // Identifies the temporal slots this layer applies to.
  // For example, with two temporal layers, typically one
  // layer will have [0] and one [1].
  // But in more complex patterns with 4 temporal slots,
  // one layer might fit in [0], another [2], and another [1, 3].
  sequence<unsigned long> temporalSlots;

  // The layer IDs of the layers this layer depends on.
  // Note that you likely want a layer to depend on itself;
  // otherwise, a base layer would be all key frames.
  sequence<DOMString> dependsOn;

  // How much to scale down resolution before encoding
  double scaleDownBy;

  // If unset, the codec will guess how much you want
  // (dividing up the total bitrate between the layers based on size
  // and framerate)
  unsigned long? bitsPerSecond;
};

// Video encoder settings that are not part of VideoEncoderDynamicSettings.
// NOTE(review): "static" presumably means fixed for the encoder's lifetime,
// as opposed to settings changeable per input — confirm.
dictionary VideoEncoderStaticSettings {
  // Can be used to initialize the encoder faster
  // than waiting for the first frame.
  unsigned long? expectedWidth;
  unsigned long? expectedHeight;

  // Layer descriptions; see VideoEncodeLayer.
  sequence<VideoEncodeLayer> layers;
};

// Video encoder settings that can also be supplied alongside an input
// (see VideoEncoderInput.changeSettings).
dictionary VideoEncoderDynamicSettings {
  // unset/null means the encoder will pick.
  // The target will be exceeded for key frames.
  // NOTE(review): named and sized differently from the `unsigned long?
  // bitsPerSecond` members used elsewhere in this file — reconcile when
  // writing the spec.
  unsigned long long? targetBitRate;

  // Kind of content being encoded.
  VideoEncodeContentMode contentMode;

  // If true, the next compatible image will be encoded as a key frame.
  boolean? requestKeyFrame;
};

// Complete set of video encoder settings: the static and the dynamic members
// combined. Adds no members of its own.
// The parents are named as they are actually declared in this file
// (VideoEncoderStaticSettings / VideoEncoderDynamicSettings).
// NOTE(review): Web IDL dictionaries may inherit from only ONE dictionary, so
// this two-parent form will not parse as-is. It is kept to show the intent;
// restructure (e.g. chain the parents or fold the members in) when writing
// the spec.
dictionary VideoEncoderSettings : VideoEncoderStaticSettings, VideoEncoderDynamicSettings {
};

// Constructor argument for VideoEncoder: the shared video codec parameters
// plus the encoder settings.
dictionary VideoEncoderInit : VideoCodecParameters {
  // Encoder settings; not marked `required`, so the member may be omitted.
  VideoEncoderSettings settings;
};

// One item written to VideoEncoder.writable; also the item type produced by
// VideoTrackReader.readable.
dictionary VideoEncoderInput {
  // The frame to encode.
  VideoFrame frame;
  // Dynamic settings supplied together with this input.
  // NOTE(review): presumably applied starting with this frame — confirm.
  VideoEncoderDynamicSettings changeSettings;
};

// One item read from VideoEncoder.readable.
dictionary VideoEncoderOutput {
  // NOTE(review): presumably the encoded video payload — confirm.
  Uint8Array data;
  // Media time associated with `data`.
  MediaTime timestamp;

  // If using multiple layers, which layer is it?
  // (Matches a VideoEncodeLayer.id.)
  DOMString? layerId;

  // Whether or not it's a key frame, meaning it depends on
  // no other frames.
  boolean keyFrame;

  // TODO: per-frame encode stats.
};


// MediaStreamTrack integration.

// Constructed from a MediaStreamTrack; exposes a readable stream whose items
// are AudioEncoderInput.
// NOTE(review): presumably `track` must be an audio track — confirm.
[Constructor(MediaStreamTrack track)]
interface AudioTrackReader {
  readonly attribute ReadableStream readable;  // of AudioEncoderInput
};

// Takes no constructor arguments; exposes a writable stream that accepts
// AudioDecoderOutput items, together with a MediaStreamTrack.
// NOTE(review): presumably `track` plays back what is written to
// `writable` — confirm.
[Constructor()]
interface AudioTrackWriter {
  readonly attribute WritableStream writable;  // of AudioDecoderOutput
  readonly attribute MediaStreamTrack track;
};

// Constructed from a MediaStreamTrack; exposes a readable stream whose items
// are VideoEncoderInput.
// NOTE(review): presumably `track` must be a video track — confirm.
[Constructor(MediaStreamTrack track)]
interface VideoTrackReader {
  readonly attribute ReadableStream readable;  // of VideoEncoderInput
};

// Takes no constructor arguments; exposes a writable stream that accepts
// VideoDecoderOutput items, together with a MediaStreamTrack.
// NOTE(review): presumably `track` carries the frames written to
// `writable` — confirm.
[Constructor()]
interface VideoTrackWriter {
  readonly attribute WritableStream writable;  // of VideoDecoderOutput
  readonly attribute MediaStreamTrack track;
};