From 32a9e2d4d8a1e931ad7bd701877f7a4150ba9d19 Mon Sep 17 00:00:00 2001
From: Joshua Lochner
Date: Tue, 26 Dec 2023 19:25:08 +0200
Subject: [PATCH] Add support for Segformer

---
 README.md                                |  1 +
 docs/snippets/6_supported-models.snippet |  1 +
 scripts/supported_models.py              | 40 ++++++++++++--
 src/models.js                            | 38 ++++++++++++++
 src/pipelines.js                         | 23 ++++++++-
 src/processors.js                        | 66 ++++++++++++++++++++++++
 6 files changed, 164 insertions(+), 5 deletions(-)

diff --git a/README.md b/README.md
index 307f92e53..f926cdd49 100644
--- a/README.md
+++ b/README.md
@@ -326,6 +326,7 @@ You can refine your search by selecting the task you're interested in (e.g., [te
 1. **[ResNet](https://huggingface.co/docs/transformers/model_doc/resnet)** (from Microsoft Research) released with the paper [Deep Residual Learning for Image Recognition](https://arxiv.org/abs/1512.03385) by Kaiming He, Xiangyu Zhang, Shaoqing Ren, Jian Sun.
 1. **[RoBERTa](https://huggingface.co/docs/transformers/model_doc/roberta)** (from Facebook), released together with the paper [RoBERTa: A Robustly Optimized BERT Pretraining Approach](https://arxiv.org/abs/1907.11692) by Yinhan Liu, Myle Ott, Naman Goyal, Jingfei Du, Mandar Joshi, Danqi Chen, Omer Levy, Mike Lewis, Luke Zettlemoyer, Veselin Stoyanov.
 1. **[RoFormer](https://huggingface.co/docs/transformers/model_doc/roformer)** (from ZhuiyiTechnology), released together with the paper [RoFormer: Enhanced Transformer with Rotary Position Embedding](https://arxiv.org/abs/2104.09864) by Jianlin Su and Yu Lu and Shengfeng Pan and Bo Wen and Yunfeng Liu.
+1. **[SegFormer](https://huggingface.co/docs/transformers/model_doc/segformer)** (from NVIDIA) released with the paper [SegFormer: Simple and Efficient Design for Semantic Segmentation with Transformers](https://arxiv.org/abs/2105.15203) by Enze Xie, Wenhai Wang, Zhiding Yu, Anima Anandkumar, Jose M. Alvarez, Ping Luo.
 1. **[SpeechT5](https://huggingface.co/docs/transformers/model_doc/speecht5)** (from Microsoft Research) released with the paper [SpeechT5: Unified-Modal Encoder-Decoder Pre-Training for Spoken Language Processing](https://arxiv.org/abs/2110.07205) by Junyi Ao, Rui Wang, Long Zhou, Chengyi Wang, Shuo Ren, Yu Wu, Shujie Liu, Tom Ko, Qing Li, Yu Zhang, Zhihua Wei, Yao Qian, Jinyu Li, Furu Wei.
 1. **[SqueezeBERT](https://huggingface.co/docs/transformers/model_doc/squeezebert)** (from Berkeley) released with the paper [SqueezeBERT: What can computer vision teach NLP about efficient neural networks?](https://arxiv.org/abs/2006.11316) by Forrest N. Iandola, Albert E. Shaw, Ravi Krishna, and Kurt W. Keutzer.
 1. **[Swin Transformer](https://huggingface.co/docs/transformers/model_doc/swin)** (from Microsoft) released with the paper [Swin Transformer: Hierarchical Vision Transformer using Shifted Windows](https://arxiv.org/abs/2103.14030) by Ze Liu, Yutong Lin, Yue Cao, Han Hu, Yixuan Wei, Zheng Zhang, Stephen Lin, Baining Guo.
diff --git a/docs/snippets/6_supported-models.snippet b/docs/snippets/6_supported-models.snippet
index 7fdbe4c18..1ea5e15df 100644
--- a/docs/snippets/6_supported-models.snippet
+++ b/docs/snippets/6_supported-models.snippet
@@ -61,6 +61,7 @@
 1. **[ResNet](https://huggingface.co/docs/transformers/model_doc/resnet)** (from Microsoft Research) released with the paper [Deep Residual Learning for Image Recognition](https://arxiv.org/abs/1512.03385) by Kaiming He, Xiangyu Zhang, Shaoqing Ren, Jian Sun.
 1. **[RoBERTa](https://huggingface.co/docs/transformers/model_doc/roberta)** (from Facebook), released together with the paper [RoBERTa: A Robustly Optimized BERT Pretraining Approach](https://arxiv.org/abs/1907.11692) by Yinhan Liu, Myle Ott, Naman Goyal, Jingfei Du, Mandar Joshi, Danqi Chen, Omer Levy, Mike Lewis, Luke Zettlemoyer, Veselin Stoyanov.
 1. **[RoFormer](https://huggingface.co/docs/transformers/model_doc/roformer)** (from ZhuiyiTechnology), released together with the paper [RoFormer: Enhanced Transformer with Rotary Position Embedding](https://arxiv.org/abs/2104.09864) by Jianlin Su and Yu Lu and Shengfeng Pan and Bo Wen and Yunfeng Liu.
+1. **[SegFormer](https://huggingface.co/docs/transformers/model_doc/segformer)** (from NVIDIA) released with the paper [SegFormer: Simple and Efficient Design for Semantic Segmentation with Transformers](https://arxiv.org/abs/2105.15203) by Enze Xie, Wenhai Wang, Zhiding Yu, Anima Anandkumar, Jose M. Alvarez, Ping Luo.
 1. **[SpeechT5](https://huggingface.co/docs/transformers/model_doc/speecht5)** (from Microsoft Research) released with the paper [SpeechT5: Unified-Modal Encoder-Decoder Pre-Training for Spoken Language Processing](https://arxiv.org/abs/2110.07205) by Junyi Ao, Rui Wang, Long Zhou, Chengyi Wang, Shuo Ren, Yu Wu, Shujie Liu, Tom Ko, Qing Li, Yu Zhang, Zhihua Wei, Yao Qian, Jinyu Li, Furu Wei.
 1. **[SqueezeBERT](https://huggingface.co/docs/transformers/model_doc/squeezebert)** (from Berkeley) released with the paper [SqueezeBERT: What can computer vision teach NLP about efficient neural networks?](https://arxiv.org/abs/2006.11316) by Forrest N. Iandola, Albert E. Shaw, Ravi Krishna, and Kurt W. Keutzer.
 1. **[Swin Transformer](https://huggingface.co/docs/transformers/model_doc/swin)** (from Microsoft) released with the paper [Swin Transformer: Hierarchical Vision Transformer using Shifted Windows](https://arxiv.org/abs/2103.14030) by Ze Liu, Yutong Lin, Yue Cao, Han Hu, Yixuan Wei, Zheng Zhang, Stephen Lin, Baining Guo.
diff --git a/scripts/supported_models.py b/scripts/supported_models.py
index 8e8946296..9ac52ba8a 100644
--- a/scripts/supported_models.py
+++ b/scripts/supported_models.py
@@ -362,7 +362,7 @@
             'distilbert-base-cased',
         ],
     },
-    'dit': {  # NOTE: DiT has the same architecture as BEiT.
+    'dit': {  # NOTE: DiT has the same architecture as BEiT.
         # Feature extraction
         # NOTE: requires --task feature-extraction
         'feature-extraction': [
@@ -680,8 +680,8 @@
             'hf-tiny-model-private/tiny-random-RoFormerForTokenClassification',
         ],
 
-        # TODO
-        # # Text generation
+        # TODO
+        # # Text generation
         # 'text-generation': [
         #     'hf-tiny-model-private/tiny-random-RoFormerForCausalLM',
         # ],
@@ -736,6 +736,40 @@
         #     'facebook/sam-vit-large',
         #     'facebook/sam-vit-huge',
         # ],
+    'segformer': {
+        # Image segmentation
+        'image-segmentation': [
+            'mattmdjaga/segformer_b0_clothes',
+            'mattmdjaga/segformer_b2_clothes',
+            'jonathandinu/face-parsing',
+
+            'nvidia/segformer-b0-finetuned-cityscapes-768-768',
+            'nvidia/segformer-b0-finetuned-cityscapes-512-1024',
+            'nvidia/segformer-b0-finetuned-cityscapes-640-1280',
+            'nvidia/segformer-b0-finetuned-cityscapes-1024-1024',
+            'nvidia/segformer-b1-finetuned-cityscapes-1024-1024',
+            'nvidia/segformer-b2-finetuned-cityscapes-1024-1024',
+            'nvidia/segformer-b3-finetuned-cityscapes-1024-1024',
+            'nvidia/segformer-b4-finetuned-cityscapes-1024-1024',
+            'nvidia/segformer-b5-finetuned-cityscapes-1024-1024',
+            'nvidia/segformer-b0-finetuned-ade-512-512',
+            'nvidia/segformer-b1-finetuned-ade-512-512',
+            'nvidia/segformer-b2-finetuned-ade-512-512',
+            'nvidia/segformer-b3-finetuned-ade-512-512',
+            'nvidia/segformer-b4-finetuned-ade-512-512',
+            'nvidia/segformer-b5-finetuned-ade-640-640',
+        ],
+
+        # Image classification
+        'image-classification': [
+            'nvidia/mit-b0',
+            'nvidia/mit-b1',
+            'nvidia/mit-b2',
+            'nvidia/mit-b3',
+            'nvidia/mit-b4',
+            'nvidia/mit-b5',
+        ],
+    },
     'speecht5': {
         # Text-to-audio/Text-to-speech
diff --git a/src/models.js b/src/models.js
index cc3345cda..1c4d485e3 100644
--- a/src/models.js
+++ b/src/models.js
@@ -4736,6 +4736,27 @@ export class VitsModel extends VitsPreTrainedModel {
 }
 //////////////////////////////////////////////////
 
+//////////////////////////////////////////////////
+// Segformer models
+export class SegformerPreTrainedModel extends PreTrainedModel { }
+
+/**
+ * The bare SegFormer encoder (Mix-Transformer) outputting raw hidden-states without any specific head on top.
+ */
+export class SegformerModel extends SegformerPreTrainedModel { }
+
+/**
+ * SegFormer Model transformer with an image classification head on top (a linear layer on top of the final hidden states) e.g. for ImageNet.
+ */
+export class SegformerForImageClassification extends SegformerPreTrainedModel { }
+
+/**
+ * SegFormer Model transformer with an all-MLP decode head on top e.g. for ADE20k, CityScapes.
+ */
+export class SegformerForSemanticSegmentation extends SegformerPreTrainedModel { }
+
+//////////////////////////////////////////////////
+
 //////////////////////////////////////////////////
 // AutoModels, used to simplify construction of PreTrainedModels
 
@@ -5020,6 +5041,7 @@ const MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING_NAMES = new Map([
     ['dinov2', ['Dinov2ForImageClassification', Dinov2ForImageClassification]],
     ['resnet', ['ResNetForImageClassification', ResNetForImageClassification]],
     ['swin', ['SwinForImageClassification', SwinForImageClassification]],
+    ['segformer', ['SegformerForImageClassification', SegformerForImageClassification]],
 ]);
 
 const MODEL_FOR_OBJECT_DETECTION_MAPPING_NAMES = new Map([
@@ -5036,6 +5058,10 @@ const MODEL_FOR_IMAGE_SEGMENTATION_MAPPING_NAMES = new Map([
     ['detr', ['DetrForSegmentation', DetrForSegmentation]],
 ]);
 
+const MODEL_FOR_SEMANTIC_SEGMENTATION_MAPPING_NAMES = new Map([
+    ['segformer', ['SegformerForSemanticSegmentation', SegformerForSemanticSegmentation]],
+]);
+
 const MODEL_FOR_MASK_GENERATION_MAPPING_NAMES = new Map([
     ['sam', ['SamModel', SamModel]],
 ]);
@@ -5081,6 +5107,7 @@ const MODEL_CLASS_TYPE_MAPPING = [
     [MODEL_FOR_VISION_2_SEQ_MAPPING_NAMES, MODEL_TYPES.Vision2Seq],
     [MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING_NAMES, MODEL_TYPES.EncoderOnly],
     [MODEL_FOR_IMAGE_SEGMENTATION_MAPPING_NAMES, MODEL_TYPES.EncoderOnly],
+    [MODEL_FOR_SEMANTIC_SEGMENTATION_MAPPING_NAMES, MODEL_TYPES.EncoderOnly],
     [MODEL_FOR_IMAGE_MATTING_MAPPING_NAMES, MODEL_TYPES.EncoderOnly],
     [MODEL_FOR_IMAGE_TO_IMAGE_MAPPING_NAMES, MODEL_TYPES.EncoderOnly],
     [MODEL_FOR_DEPTH_ESTIMATION_MAPPING_NAMES, MODEL_TYPES.EncoderOnly],
@@ -5260,6 +5287,17 @@ export class AutoModelForImageSegmentation extends PretrainedMixin {
     static MODEL_CLASS_MAPPINGS = [MODEL_FOR_IMAGE_SEGMENTATION_MAPPING_NAMES];
 }
 
+/**
+ * Helper class which is used to instantiate pretrained semantic segmentation models with the `from_pretrained` function.
+ * The chosen model class is determined by the type specified in the model config.
+ *
+ * @example
+ * let model = await AutoModelForSemanticSegmentation.from_pretrained('nvidia/segformer-b3-finetuned-cityscapes-1024-1024');
+ */
+export class AutoModelForSemanticSegmentation extends PretrainedMixin {
+    static MODEL_CLASS_MAPPINGS = [MODEL_FOR_SEMANTIC_SEGMENTATION_MAPPING_NAMES];
+}
+
 /**
  * Helper class which is used to instantiate pretrained object detection models with the `from_pretrained` function.
  * The chosen model class is determined by the type specified in the model config.
diff --git a/src/pipelines.js b/src/pipelines.js
index d65b9e189..1b7404557 100644
--- a/src/pipelines.js
+++ b/src/pipelines.js
@@ -33,6 +33,7 @@
     AutoModelForVision2Seq,
     AutoModelForImageClassification,
     AutoModelForImageSegmentation,
+    AutoModelForSemanticSegmentation,
     AutoModelForObjectDetection,
     AutoModelForZeroShotObjectDetection,
     AutoModelForDocumentQuestionAnswering,
@@ -1710,8 +1711,26 @@
                 }
             }
 
         } else if (subtask === 'semantic') {
-            throw Error(`semantic segmentation not yet supported.`);
+            const { segmentation, labels } = fn(output, target_sizes ?? imageSizes)[0];
+            const id2label = this.model.config.id2label;
+
+            for (let label of labels) {
+                const maskData = new Uint8ClampedArray(segmentation.data.length);
+                for (let i = 0; i < segmentation.data.length; ++i) {
+                    if (segmentation.data[i] === label) {
+                        maskData[i] = 255;
+                    }
+                }
+
+                const mask = new RawImage(maskData, segmentation.dims[1], segmentation.dims[0], 1);
+
+                annotation.push({
+                    score: null,
+                    label: id2label[label],
+                    mask: mask
+                });
+            }
         } else {
             throw Error(`Subtask ${subtask} not supported.`);
         }
@@ -2488,7 +2507,7 @@
     "image-segmentation": {
         // no tokenizer
         "pipeline": ImageSegmentationPipeline,
-        "model": AutoModelForImageSegmentation,
+        "model": [AutoModelForImageSegmentation, AutoModelForSemanticSegmentation],
         "processor": AutoProcessor,
         "default": {
             // TODO: replace with original
diff --git a/src/processors.js b/src/processors.js
index c932f1810..edf48afb0 100644
--- a/src/processors.js
+++ b/src/processors.js
@@ -609,6 +609,71 @@
 }
 
+export class SegformerFeatureExtractor extends ImageFeatureExtractor {
+
+    /**
+     * Converts the output of `SegformerForSemanticSegmentation` into semantic segmentation maps.
+     * @param {*} outputs Raw outputs of the model.
+     * @param {number[][]} [target_sizes=null] List of tuples corresponding to the requested final size
+     * (height, width) of each prediction. If unset, predictions will not be resized.
+     * @returns {{segmentation: Tensor; labels: number[]}[]} The semantic segmentation maps.
+     */
+    post_process_semantic_segmentation(outputs, target_sizes = null) {
+
+        const logits = outputs.logits;
+        const batch_size = logits.dims[0];
+
+        if (target_sizes !== null && target_sizes.length !== batch_size) {
+            throw Error("Make sure that you pass in as many target sizes as the batch dimension of the logits")
+        }
+
+        const toReturn = [];
+        for (let i = 0; i < batch_size; ++i) {
+            const target_size = target_sizes !== null ? target_sizes[i] : null;
+
+            let data = logits[i];
+
+            // 1. If target_size is not null, we need to resize the masks to the target size
+            if (target_size !== null) {
+                // resize the masks to the target size
+                data = interpolate(data, target_size, 'bilinear', false);
+            }
+            const [height, width] = target_size ?? data.dims.slice(-2);
+
+            const segmentation = new Tensor(
+                'int32',
+                new Int32Array(height * width),
+                [height, width]
+            );
+
+            // Buffer to store current largest value
+            const buffer = data[0].data;
+            for (let j = 1; j < data.dims[0]; ++j) {
+                const row = data[j].data;
+                for (let k = 0; k < row.length; ++k) {
+                    if (row[k] > buffer[k]) {
+                        buffer[k] = row[k];
+                        segmentation.data[k] = j;
+                    }
+                }
+            }
+
+            // Store which objects have labels
+            // This is much more efficient than creating a set of the final values
+            const hasLabel = new Array(data.dims[0]);
+            const out = segmentation.data;
+            for (let j = 0; j < out.length; ++j) {
+                const index = out[j];
+                hasLabel[index] = index;
+            }
+            /** @type {number[]} The unique list of labels that were detected */
+            const labels = hasLabel.filter(x => x !== undefined);
+
+            toReturn.push({ segmentation, labels });
+        }
+        return toReturn;
+    }
+}
 export class BitImageProcessor extends ImageFeatureExtractor { }
 export class DPTFeatureExtractor extends ImageFeatureExtractor { }
 export class GLPNFeatureExtractor extends ImageFeatureExtractor { }
@@ -1699,6 +1764,7 @@ export class AutoProcessor {
         ChineseCLIPFeatureExtractor,
         ConvNextFeatureExtractor,
         ConvNextImageProcessor,
+        SegformerFeatureExtractor,
         BitImageProcessor,
         DPTFeatureExtractor,
         GLPNFeatureExtractor,
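
For reviewers who want to try the new `semantic` branch of `ImageSegmentationPipeline` end to end, the sketch below shows the intended usage. It is illustrative only: the model id is taken from the list added to `scripts/supported_models.py` above, and the image URL is a placeholder, not something this patch references.

```js
// Minimal sketch (not part of the patch): running a SegFormer checkpoint through the
// existing image-segmentation pipeline. Assumes an ES module with top-level await.
import { pipeline } from '@xenova/transformers';

// The updated SUPPORTED_TASKS entry lets the pipeline factory fall back to
// AutoModelForSemanticSegmentation when AutoModelForImageSegmentation has no match.
const segmenter = await pipeline('image-segmentation', 'mattmdjaga/segformer_b2_clothes');

// Placeholder URL; any input readable by RawImage works here.
const output = await segmenter('https://example.com/photo-of-a-person.jpg');

// Because SegformerFeatureExtractor only defines post_process_semantic_segmentation,
// the pipeline selects the 'semantic' subtask automatically. Each entry is
// { score: null, label, mask }, where mask is a single-channel RawImage.
console.log(output.map(x => x.label));
```

Note that `score` is `null` by design, mirroring the annotation objects pushed in the new `semantic` branch, since semantic maps carry no per-segment confidence.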
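The post-processing added to `SegformerFeatureExtractor` can also be exercised without the pipeline wrapper. The following sketch assumes the usual `AutoProcessor` / model calling convention used elsewhere in the library (including the `processor.feature_extractor` property); the checkpoint id and image URL are again placeholders.

```js
// Sketch (assumptions noted above): calling post_process_semantic_segmentation directly.
import { AutoProcessor, AutoModelForSemanticSegmentation, RawImage } from '@xenova/transformers';

const model_id = 'nvidia/segformer-b0-finetuned-ade-512-512';
const processor = await AutoProcessor.from_pretrained(model_id);
const model = await AutoModelForSemanticSegmentation.from_pretrained(model_id);

const image = await RawImage.read('https://example.com/street-scene.jpg'); // placeholder URL
const { pixel_values } = await processor(image);
const outputs = await model({ pixel_values });

// Resize the predicted map back to the original image size (height, width).
const [{ segmentation, labels }] = processor.feature_extractor
    .post_process_semantic_segmentation(outputs, [[image.height, image.width]]);

// `segmentation` is an int32 Tensor of shape [height, width] holding per-pixel class ids,
// and `labels` lists the class ids that actually occur; map them via config.id2label.
console.log(labels.map(l => model.config.id2label[l]));
```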