From 9b659f5dc330f978d0bb871611a7a04270d24ba7 Mon Sep 17 00:00:00 2001 From: Arturo Bernal Date: Sun, 22 Sep 2024 21:00:17 +0200 Subject: [PATCH] HTTPCLIENT-2159: Fix handling of charset in ContentType for specific media types * Updated ContentType to ensure that no charset is included for media types like application/octet-stream, multipart/form-data, and image/*, which do not require a charset as per the RFC. * Refactored the toString() method to properly handle the omission of charset for these media types. * Adjusted the creation methods to better handle implicit charsets and added validation for reserved characters in MIME types. --- .../org/apache/hc/core5/http/ContentType.java | 192 ++++++++++++++---- .../apache/hc/core5/http/TestContentType.java | 117 +++++++++++ .../http/io/entity/TestStringEntity.java | 2 +- 3 files changed, 270 insertions(+), 41 deletions(-) diff --git a/httpcore5/src/main/java/org/apache/hc/core5/http/ContentType.java b/httpcore5/src/main/java/org/apache/hc/core5/http/ContentType.java index b8e049961..9ce5121bb 100644 --- a/httpcore5/src/main/java/org/apache/hc/core5/http/ContentType.java +++ b/httpcore5/src/main/java/org/apache/hc/core5/http/ContentType.java @@ -41,6 +41,7 @@ import org.apache.hc.core5.annotation.Contract; import org.apache.hc.core5.annotation.ThreadingBehavior; +import org.apache.hc.core5.http.message.BasicHeaderValueFormatter; import org.apache.hc.core5.http.message.BasicNameValuePair; import org.apache.hc.core5.http.message.MessageSupport; import org.apache.hc.core5.http.message.ParserCursor; @@ -67,105 +68,125 @@ public final class ContentType implements Serializable { */ private static final String CHARSET = "charset"; + /** + * Flag indicating whether the charset is implicit. + *

+ * When {@code implicitCharset} is {@code true}, the charset will not be explicitly + * included in the string representation of this {@link ContentType} (i.e., in the {@code toString} method), + * unless it is required for the given MIME type. + * If {@code implicitCharset} is {@code false}, the charset will always be included in the string representation, + * unless the MIME type explicitly disallows charset parameters (e.g., certain binary or multipart types). + *

+ *

+ * This flag is essential for proper handling of content types where the charset is either implied by the specification + * (e.g., JSON is always UTF-8) or where including a charset is not meaningful (e.g., binary types like + * {@code application/octet-stream}). + *

+ * + * @since 5.5 + */ + private final boolean implicitCharset; + + // constants public static final ContentType APPLICATION_ATOM_XML = create( - "application/atom+xml", StandardCharsets.UTF_8); + "application/atom+xml", StandardCharsets.UTF_8, false); public static final ContentType APPLICATION_FORM_URLENCODED = create( - "application/x-www-form-urlencoded", StandardCharsets.ISO_8859_1); + "application/x-www-form-urlencoded", StandardCharsets.ISO_8859_1, true); public static final ContentType APPLICATION_JSON = create( - "application/json", StandardCharsets.UTF_8); + "application/json", StandardCharsets.UTF_8, true); /** * Public constant media type for {@code application/x-ndjson}. * @since 5.1 */ public static final ContentType APPLICATION_NDJSON = create( - "application/x-ndjson", StandardCharsets.UTF_8); + "application/x-ndjson", StandardCharsets.UTF_8, true); public static final ContentType APPLICATION_OCTET_STREAM = create( - "application/octet-stream", (Charset) null); + "application/octet-stream", (Charset) null, true); /** * Public constant media type for {@code application/pdf}. * @since 5.1 */ public static final ContentType APPLICATION_PDF = create( - "application/pdf", StandardCharsets.UTF_8); + "application/pdf", (Charset) null, true); public static final ContentType APPLICATION_SOAP_XML = create( - "application/soap+xml", StandardCharsets.UTF_8); + "application/soap+xml", StandardCharsets.UTF_8, false); public static final ContentType APPLICATION_SVG_XML = create( - "application/svg+xml", StandardCharsets.UTF_8); + "application/svg+xml", StandardCharsets.UTF_8, false); public static final ContentType APPLICATION_XHTML_XML = create( - "application/xhtml+xml", StandardCharsets.UTF_8); + "application/xhtml+xml", StandardCharsets.UTF_8, false); public static final ContentType APPLICATION_XML = create( - "application/xml", StandardCharsets.UTF_8); + "application/xml", StandardCharsets.UTF_8, false); /** * Public constant media type for {@code application/problem+json}. * @see Problem Details for HTTP APIs, 6.1. application/problem+json * @since 5.1 */ public static final ContentType APPLICATION_PROBLEM_JSON = create( - "application/problem+json", StandardCharsets.UTF_8); + "application/problem+json", StandardCharsets.UTF_8, true); /** * Public constant media type for {@code application/problem+xml}. * @see Problem Details for HTTP APIs, 6.2. application/problem+xml * @since 5.1 */ public static final ContentType APPLICATION_PROBLEM_XML = create( - "application/problem+xml", StandardCharsets.UTF_8); + "application/problem+xml", StandardCharsets.UTF_8, false); /** * Public constant media type for {@code application/rss+xml}. * @since 5.1 */ public static final ContentType APPLICATION_RSS_XML = create( - "application/rss+xml", StandardCharsets.UTF_8); + "application/rss+xml", StandardCharsets.UTF_8, false); public static final ContentType IMAGE_BMP = create( - "image/bmp"); + "image/bmp", (Charset) null, true); public static final ContentType IMAGE_GIF = create( - "image/gif"); + "image/gif", (Charset) null, true); public static final ContentType IMAGE_JPEG = create( - "image/jpeg"); + "image/jpeg", (Charset) null, true); public static final ContentType IMAGE_PNG = create( - "image/png"); + "image/png", (Charset) null, true); public static final ContentType IMAGE_SVG = create( - "image/svg+xml"); + "image/svg+xml", (Charset) null, false); public static final ContentType IMAGE_TIFF = create( - "image/tiff"); + "image/tiff", (Charset) null, true); public static final ContentType IMAGE_WEBP = create( - "image/webp"); + "image/webp", (Charset) null, true); public static final ContentType MULTIPART_FORM_DATA = create( - "multipart/form-data", StandardCharsets.ISO_8859_1); + "multipart/form-data", StandardCharsets.ISO_8859_1, true); /** * Public constant media type for {@code multipart/mixed}. * @since 5.1 */ public static final ContentType MULTIPART_MIXED = create( - "multipart/mixed", StandardCharsets.ISO_8859_1); + "multipart/mixed", StandardCharsets.ISO_8859_1, true); /** * Public constant media type for {@code multipart/related}. * @since 5.1 */ public static final ContentType MULTIPART_RELATED = create( - "multipart/related", StandardCharsets.ISO_8859_1); + "multipart/related", StandardCharsets.ISO_8859_1, true); public static final ContentType TEXT_HTML = create( - "text/html", StandardCharsets.UTF_8); + "text/html", StandardCharsets.UTF_8, true); /** * Public constant media type for {@code text/markdown}. * @since 5.1 */ public static final ContentType TEXT_MARKDOWN = create( - "text/markdown", StandardCharsets.UTF_8); + "text/markdown", StandardCharsets.UTF_8, false); public static final ContentType TEXT_PLAIN = create( - "text/plain", StandardCharsets.UTF_8); + "text/plain", StandardCharsets.UTF_8, true); public static final ContentType TEXT_XML = create( - "text/xml", StandardCharsets.UTF_8); + "text/xml", StandardCharsets.UTF_8, false); /** * Public constant media type for {@code text/event-stream}. * @see Server-Sent Events W3C recommendation @@ -175,7 +196,7 @@ public final class ContentType implements Serializable { "text/event-stream", StandardCharsets.UTF_8); public static final ContentType WILDCARD = create( - "*/*", (Charset) null); + "*/*", (Charset) null, true); /** * An empty immutable {@code NameValuePair} array. @@ -225,18 +246,42 @@ public final class ContentType implements Serializable { ContentType( final String mimeType, final Charset charset) { - this.mimeType = mimeType; - this.charset = charset; - this.params = null; + this (mimeType,charset,null, false); } ContentType( final String mimeType, final Charset charset, final NameValuePair[] params) { + + this (mimeType,charset,params, false); + } + + /** + * Constructs a new instance of {@link ContentType} with the given MIME type, charset, parameters, + * and an implicit charset flag. + *

+ * If {@code implicitCharset} is set to {@code true}, the charset will not be explicitly + * included in the string representation of this content type (i.e., the {@code toString} method) + * unless it is required for the given MIME type. + * If {@code implicitCharset} is {@code false}, the charset will always be included in the + * string representation unless the MIME type is one of those that should not include a charset. + *

+ * + * @param mimeType the MIME type. It must not be {@code null} or empty and must not contain + * reserved characters such as {@code <">, <;>, <,>}. + * @param charset the character set for this content type. This can be {@code null}. + * @param params optional parameters for this content type. If {@code null}, no additional + * parameters will be included. + * @param implicitCharset whether the charset is implicit. If {@code true}, the charset is not + * included in the {@code toString} output unless required. + * @since 5.5 + */ + ContentType(final String mimeType, final Charset charset, final NameValuePair[] params, final boolean implicitCharset) { this.mimeType = mimeType; this.charset = charset; - this.params = params; + this.implicitCharset = implicitCharset; + this.params = params != null ? params.clone() : null; } public String getMimeType() { @@ -288,8 +333,19 @@ public String toString() { buf.append(this.mimeType); if (this.params != null) { buf.append("; "); - MessageSupport.formatParameters(buf, this.params); - } else if (this.charset != null) { + boolean first = true; + for (final NameValuePair param : params) { + if (!first) { + buf.append("; "); + } + if (param.getName().equalsIgnoreCase("charset") && implicitCharset) { + continue; + } + BasicHeaderValueFormatter.INSTANCE.formatNameValuePair(buf, param, false); + first = false; + } + } else if (this.charset != null && !implicitCharset) { + // Append charset only if it's not one of the types that shouldn't have charset buf.append("; charset="); buf.append(this.charset.name()); } @@ -306,6 +362,58 @@ private static boolean valid(final String s) { return true; } + + /** + * Creates a new instance of {@link ContentType} with the given MIME type, charset, + * and an implicit charset flag. + *

+ * This method allows specifying whether the charset should be implicit or explicit. + * If {@code implicitCharset} is set to {@code true}, the charset will not be explicitly + * included in the string representation of this content type (i.e., the {@code toString} method), + * unless it is required for the given MIME type. If {@code implicitCharset} is {@code false}, + * the charset will always be included unless the MIME type does not allow a charset. + *

+ * + * @param mimeType the MIME type. It must not be {@code null} or empty and must not contain + * reserved characters such as {@code <">, <;>, <,>}. + * @param charset the character set for this content type. This can be {@code null}. + * @param implicitCharset whether the charset is implicit. If {@code true}, the charset is + * not included in the {@code toString} output unless required. + * @return a new instance of {@link ContentType}. + * @throws IllegalArgumentException if the MIME type is invalid or contains reserved characters. + * @since 5.5 + */ + public static ContentType create(final String mimeType, final Charset charset, final boolean implicitCharset) { + final String normalizedMimeType = TextUtils.toLowerCase(Args.notBlank(mimeType, "MIME type")); + Args.check(valid(normalizedMimeType), "MIME type may not contain reserved characters"); + return new ContentType(normalizedMimeType, charset, null, implicitCharset); + } + + /** + * Creates a new instance of {@link ContentType} with the given MIME type, parameters, + * and an implicit charset flag. + *

+ * This method allows specifying additional parameters for the content type and whether + * the charset should be implicit or explicit. If {@code implicitCharset} is {@code true}, + * the charset will not be included in the string representation unless required. + *

+ * + * @param mimeType the MIME type. It must not be {@code null} or empty and must not contain + * reserved characters such as {@code <">, <;>, <,>}. + * @param implicitCharset whether the charset is implicit. If {@code true}, the charset is + * not included in the {@code toString} output unless required. + * @param params optional parameters for the content type. Can be {@code null}. + * @return a new instance of {@link ContentType}. + * @throws IllegalArgumentException if the MIME type is invalid or contains reserved characters. + * @throws UnsupportedCharsetException if the charset provided in the parameters is not supported. + * @since 5.5 + */ + public static ContentType create(final String mimeType, final boolean implicitCharset, final NameValuePair... params) throws UnsupportedCharsetException { + final String type = TextUtils.toLowerCase(Args.notBlank(mimeType, "MIME type")); + Args.check(valid(type), "MIME type may not contain reserved characters"); + return create(mimeType, params != null ? params.clone() : null, implicitCharset); + } + /** * Creates a new instance of {@link ContentType}. * @@ -315,9 +423,7 @@ private static boolean valid(final String s) { * @return content type */ public static ContentType create(final String mimeType, final Charset charset) { - final String normalizedMimeType = TextUtils.toLowerCase(Args.notBlank(mimeType, "MIME type")); - Args.check(valid(normalizedMimeType), "MIME type may not contain reserved characters"); - return new ContentType(normalizedMimeType, charset); + return create(mimeType, charset, false); } /** @@ -326,7 +432,9 @@ public static ContentType create(final String mimeType, final Charset charset) { * @param mimeType MIME type. It may not be {@code null} or empty. It may not contain * characters {@code <">, <;>, <,>} reserved by the HTTP specification. * @return content type + * @deprecated Use {@link #create(String, Charset, boolean)} with an explicit charset or null, and specify if the charset is implicit. */ + @Deprecated public static ContentType create(final String mimeType) { return create(mimeType, (Charset) null); } @@ -356,6 +464,10 @@ private static ContentType create(final HeaderElement helem, final boolean stric } private static ContentType create(final String mimeType, final NameValuePair[] params, final boolean strict) { + return create(mimeType, params != null ? params.clone() : null, strict, false); + } + + private static ContentType create(final String mimeType, final NameValuePair[] params, final boolean strict, final boolean implicitCharset) { Charset charset = null; if (params != null) { for (final NameValuePair param : params) { @@ -374,7 +486,7 @@ private static ContentType create(final String mimeType, final NameValuePair[] p } } } - return new ContentType(mimeType, charset, params != null && params.length > 0 ? params : null); + return new ContentType(mimeType, charset, params != null && params.length > 0 ? params : null, implicitCharset); } /** @@ -517,11 +629,11 @@ public ContentType withParameters( for (final Map.Entry entry: paramMap.entrySet()) { newParams.add(new BasicNameValuePair(entry.getKey(), entry.getValue())); } - return create(this.getMimeType(), newParams.toArray(EMPTY_NAME_VALUE_PAIR_ARRAY), true); + return create(this.getMimeType(), newParams.toArray(EMPTY_NAME_VALUE_PAIR_ARRAY), true, this.implicitCharset); } public boolean isSameMimeType(final ContentType contentType) { return contentType != null && mimeType.equalsIgnoreCase(contentType.getMimeType()); } -} +} \ No newline at end of file diff --git a/httpcore5/src/test/java/org/apache/hc/core5/http/TestContentType.java b/httpcore5/src/test/java/org/apache/hc/core5/http/TestContentType.java index cea1eae59..430af0c1b 100644 --- a/httpcore5/src/test/java/org/apache/hc/core5/http/TestContentType.java +++ b/httpcore5/src/test/java/org/apache/hc/core5/http/TestContentType.java @@ -183,4 +183,121 @@ void testWithParams() throws Exception { Assertions.assertEquals("text/blah; charset=ISO-8859-1; p=blah", contentType.toString()); } + + @Test + void testImplicitCharsetTrue() { + // ContentType with implicitCharset = true + final ContentType contentType = ContentType.create("application/json", StandardCharsets.UTF_8, true); + + // Check that the charset is not added to the toString() output + Assertions.assertEquals("application/json", contentType.toString()); + // Check that the charset is still stored + Assertions.assertEquals(StandardCharsets.UTF_8, contentType.getCharset()); + } + + @Test + void testImplicitCharsetFalse() { + // ContentType with implicitCharset = false + final ContentType contentType = ContentType.create("application/json", StandardCharsets.UTF_8, false); + + // Check that the charset is included in the toString() output + Assertions.assertEquals("application/json; charset=UTF-8", contentType.toString()); + // Check that the charset is correctly stored + Assertions.assertEquals(StandardCharsets.UTF_8, contentType.getCharset()); + } + + @Test + void testImplicitCharsetForTextPlain() { + // ContentType for text/plain with implicitCharset = true + final ContentType contentType = ContentType.create("text/plain", StandardCharsets.UTF_8, true); + + // Check that the charset is not included in the toString() output + Assertions.assertEquals("text/plain", contentType.toString()); + // Check that the charset is correctly stored + Assertions.assertEquals(StandardCharsets.UTF_8, contentType.getCharset()); + } + + @Test + void testWithParamsAndImplicitCharset() { + // ContentType with parameters and implicitCharset = true + ContentType contentType = ContentType.create("text/plain", StandardCharsets.UTF_8, true) + .withParameters( + new BasicNameValuePair("p", "this"), + new BasicNameValuePair("p", "that")); + + // Check that the last "p" parameter overwrites the first one + // ImplicitCharset is true, so charset should not be included + Assertions.assertEquals("text/plain; p=that", contentType.toString()); + + // Verify that charset is still available in the object + Assertions.assertEquals(StandardCharsets.UTF_8, contentType.getCharset()); + + // Test with implicitCharset = false + contentType = ContentType.create("text/plain", StandardCharsets.UTF_8, false) + .withParameters( + new BasicNameValuePair("p", "this"), + new BasicNameValuePair("p", "that")); + + // Check that the charset is included in the toString() output due to implicitCharset = false + Assertions.assertEquals("text/plain; charset=UTF-8; p=that", contentType.toString()); + } + + @Test + void testNoCharsetForSpecificMediaTypes() { + // Testing application/octet-stream should not include charset in toString + ContentType contentType = ContentType.create("application/octet-stream", StandardCharsets.UTF_8, true); + Assertions.assertEquals("application/octet-stream", contentType.toString()); + Assertions.assertNotNull(contentType.getCharset()); // Ensure charset is set + + // Testing image/jpeg should not include charset in toString + contentType = ContentType.create("image/jpeg", StandardCharsets.UTF_8, true); + Assertions.assertEquals("image/jpeg", contentType.toString()); + Assertions.assertNotNull(contentType.getCharset()); + + // Testing multipart/form-data should not include charset in toString + contentType = ContentType.create("multipart/form-data", StandardCharsets.UTF_8, true); + Assertions.assertEquals("multipart/form-data", contentType.toString()); + Assertions.assertNotNull(contentType.getCharset()); + } + + + @Test + void testCharsetForOtherMediaTypes() { + // Testing application/json should include charset + ContentType contentType = ContentType.create("application/json", StandardCharsets.UTF_8, false); + Assertions.assertEquals("application/json; charset=UTF-8", contentType.toString()); + Assertions.assertEquals(StandardCharsets.UTF_8, contentType.getCharset()); + + // Testing text/html should include charset + contentType = ContentType.create("text/html", StandardCharsets.UTF_8, false); + Assertions.assertEquals("text/html; charset=UTF-8", contentType.toString()); + Assertions.assertEquals(StandardCharsets.UTF_8, contentType.getCharset()); + } + + @Test + void testNoCharsetForBinaryMediaTypes() throws Exception { + // Test for application/octet-stream + ContentType contentType = ContentType.create("application/octet-stream", null, true); + Assertions.assertEquals("application/octet-stream", contentType.toString()); + Assertions.assertNull(contentType.getCharset()); + + // Test for image/jpeg + contentType = ContentType.create("image/jpeg", null, true); + Assertions.assertEquals("image/jpeg", contentType.toString()); + Assertions.assertNull(contentType.getCharset()); + } + + @Test + void testFormUrlEncodedWithoutCharset() throws Exception { + // Test for application/x-www-form-urlencoded with percent-encoding + final ContentType contentType = ContentType.create("application/x-www-form-urlencoded", null, true); + Assertions.assertEquals("application/x-www-form-urlencoded", contentType.toString()); + Assertions.assertNull(contentType.getCharset()); + + // Test body encoding example with percent-encoding + final String encodedBody = "echotext=TEST%F6TEST"; + // Simulate HTTP redirect where charset shouldn't change + final String redirectedBody = "echotext=TEST%F6TEST"; + Assertions.assertEquals(encodedBody, redirectedBody); // Ensure body stays the same after redirect + } } diff --git a/httpcore5/src/test/java/org/apache/hc/core5/http/io/entity/TestStringEntity.java b/httpcore5/src/test/java/org/apache/hc/core5/http/io/entity/TestStringEntity.java index f6e370a91..0b9cfa353 100644 --- a/httpcore5/src/test/java/org/apache/hc/core5/http/io/entity/TestStringEntity.java +++ b/httpcore5/src/test/java/org/apache/hc/core5/http/io/entity/TestStringEntity.java @@ -66,7 +66,7 @@ void testDefaultContent() throws Exception { httpentity = new StringEntity(s, StandardCharsets.US_ASCII); Assertions.assertEquals("text/plain; charset=US-ASCII", httpentity.getContentType()); httpentity = new StringEntity(s); - Assertions.assertEquals("text/plain; charset=UTF-8", httpentity.getContentType()); + Assertions.assertEquals("text/plain", httpentity.getContentType()); } private static String constructString(final int [] unicodeChars) {