diff --git a/src/transformers/pipelines/zero_shot_audio_classification.py b/src/transformers/pipelines/zero_shot_audio_classification.py
index 59500d14e104e7..8ed339a5b7f889 100644
--- a/src/transformers/pipelines/zero_shot_audio_classification.py
+++ b/src/transformers/pipelines/zero_shot_audio_classification.py
@@ -78,16 +78,17 @@ def __call__(self, audios: Union[np.ndarray, bytes, str], **kwargs):
                 - A string containing a local path to an audio
                 - An audio loaded in numpy
             candidate_labels (`List[str]`):
-                The candidate labels for this audio
+                The candidate labels for this audio. They will be formatted using *hypothesis_template*.
             hypothesis_template (`str`, *optional*, defaults to `"This is a sound of {}"`):
-                The sentence used in cunjunction with *candidate_labels* to attempt the audio classification by
-                replacing the placeholder with the candidate_labels. Then likelihood is estimated by using
-                logits_per_audio
+                The template used in conjunction with *candidate_labels* to attempt the audio classification: each
+                candidate label replaces the `{}` placeholder. Pass `"{}"` if *candidate_labels* are
+                already formatted.
         Return:
-            A list of dictionaries containing result, one dictionary per proposed label. The dictionaries contain the
+            A list of dictionaries, one per proposed label. Each dictionary contains the
             following keys:
-            - **label** (`str`) -- The label identified by the model. It is one of the suggested `candidate_label`.
-            - **score** (`float`) -- The score attributed by the model for that label (between 0 and 1).
+            - **label** (`str`) -- One of the suggested *candidate_labels*.
+            - **score** (`float`) -- The score attributed by the model to that label. It is a value between
+                0 and 1, computed as the `softmax` of `logits_per_audio`.
         """
         return super().__call__(audios, **kwargs)
 
diff --git a/src/transformers/pipelines/zero_shot_image_classification.py b/src/transformers/pipelines/zero_shot_image_classification.py
index 61d5ff52227559..b7e13e782e78cc 100644
--- a/src/transformers/pipelines/zero_shot_image_classification.py
+++ b/src/transformers/pipelines/zero_shot_image_classification.py
@@ -86,12 +86,12 @@ def __call__(self, images: Union[str, List[str], "Image", List["Image"]], **kwar
                 - An image loaded in PIL directly
 
             candidate_labels (`List[str]`):
-                The candidate labels for this image
+                The candidate labels for this image. They will be formatted using *hypothesis_template*.
 
             hypothesis_template (`str`, *optional*, defaults to `"This is a photo of {}"`):
-                The sentence used in cunjunction with *candidate_labels* to attempt the image classification by
-                replacing the placeholder with the candidate_labels. Then likelihood is estimated by using
-                logits_per_image
+                The template used in conjunction with *candidate_labels* to attempt the image classification: each
+                candidate label replaces the `{}` placeholder. Pass `"{}"` if *candidate_labels* are
+                already formatted.
 
             timeout (`float`, *optional*, defaults to None):
                 The maximum time in seconds to wait for fetching images from the web. If None, no timeout is set and
@@ -101,11 +101,11 @@ def __call__(self, images: Union[str, List[str], "Image", List["Image"]], **kwar
                 Additional dictionary of keyword arguments passed along to the tokenizer.
 
         Return:
-            A list of dictionaries containing result, one dictionary per proposed label. The dictionaries contain the
+            A list of dictionaries, one per proposed label. Each dictionary contains the
             following keys:
-
-            - **label** (`str`) -- The label identified by the model. It is one of the suggested `candidate_label`.
-            - **score** (`float`) -- The score attributed by the model for that label (between 0 and 1).
+            - **label** (`str`) -- One of the suggested *candidate_labels*.
+            - **score** (`float`) -- The score attributed by the model to that label. It is a value between
+                0 and 1, computed as the `softmax` of `logits_per_image`.
         """
         return super().__call__(images, **kwargs)