Add naming conventions

opwvhk · Feb 20, 2024 · 513e27c · 513e27c
1 parent 8571eb3
commit 513e27c
Show file tree

Hide file tree

Showing 4 changed files with 946 additions and 497 deletions.
diff --git a/src/main/java/opwvhk/avro/SchemaManipulator.java b/src/main/java/opwvhk/avro/SchemaManipulator.java
@@ -1,5 +1,12 @@
 package opwvhk.avro;
 
+import net.jimblackler.jsonschemafriend.GenerationException;
+import opwvhk.avro.json.SchemaAnalyzer;
+import opwvhk.avro.util.AvroSchemaUtils;
+import opwvhk.avro.util.NamingConvention;
+import opwvhk.avro.xml.XsdAnalyzer;
+import org.apache.avro.Schema;
+
 import java.io.IOException;
 import java.io.InputStream;
 import java.net.URISyntaxException;
@@ -10,12 +17,7 @@
 import java.util.List;
 import java.util.Objects;
 import java.util.Set;
-
-import net.jimblackler.jsonschemafriend.GenerationException;
-import opwvhk.avro.json.SchemaAnalyzer;
-import opwvhk.avro.util.AvroSchemaUtils;
-import opwvhk.avro.xml.XsdAnalyzer;
-import org.apache.avro.Schema;
+import java.util.stream.Stream;
 
 import static java.util.Objects.requireNonNull;
 
@@ -29,11 +31,18 @@ public class SchemaManipulator {
 	private boolean renameWithAliases;
 	private StringBuilder markdownBuffer;
 	private List<SchemaRenamer> schemaRenamerList;
+	private SchemaRenamer schemaNamingConvention;
 	private List<FieldRenamer> fieldRenamerList;
+	private FieldRenamer fieldNamingConvention;
 	private List<UnwrapTest> unwrapTests;
 
-	private SchemaManipulator(Schema initialSchema) {
-		reset(initialSchema);
+	/**
+	 * Create a schema manipulator for a given schema.
+	 *
+	 * @param schema the schema to manipulate
+	 */
+	public SchemaManipulator(Schema schema) {
+		reset(schema);
 	}
 
 	private void reset(Schema initialSchema) {
@@ -42,7 +51,9 @@ private void reset(Schema initialSchema) {
 		renameWithAliases = true;
 		markdownBuffer = null;
 		schemaRenamerList = new ArrayList<>();
+		schemaNamingConvention = (pathToField, fieldSchema) -> null;
 		fieldRenamerList = new ArrayList<>();
+		fieldNamingConvention = (pathToField, schemaWithField, field) -> null;
 		unwrapTests = new ArrayList<>();
 	}
 
@@ -71,7 +82,8 @@ public static SchemaManipulator startFromAvro(URL schemaLocation) throws IOExcep
 	}
 
 	/**
-	 * Create a schema manipulator from an XML Schema Definition (XSD). The location of the main {@code .xsd} file is provided, both to provide the XSD content,
+	 * Create a schema manipulator from an XML Schema Definition (XSD). The location of the main {@code .xsd} file is provided, both to provide the XSD
+     * content,
 	 * as to provide a way to locate imported/included {@code .xsd} files.
 	 *
 	 * @param schemaLocation the location of the main {@code .xsd} file (it may include/import other {@code .xsd} files)
@@ -210,15 +222,15 @@ private Schema applySchemaChanges(IdentityHashMap<Schema, Schema> changedSchemas
 	}
 
 	private String newSchemaName(String path, Schema schema) {
-		return schemaRenamerList.stream()
+		return Stream.concat(schemaRenamerList.stream(), Stream.of(schemaNamingConvention))
 				.map(renamer -> renamer.newSchemaName(path, schema))
 				.filter(Objects::nonNull)
 				.findAny()
 				.orElse(null);
 	}
 
 	private String newFieldName(String path, Schema schemaWithField, Schema.Field field) {
-		return fieldRenamerList.stream()
+		return Stream.concat(fieldRenamerList.stream(), Stream.of(fieldNamingConvention))
 				.map(renamer -> renamer.newFieldName(path, schemaWithField, field))
 				.filter(Objects::nonNull)
 				.findAny()
@@ -297,6 +309,35 @@ public SchemaManipulator renameSchemaAtPath(String newSchemaName, String... path
 		return this;
 	}
 
+	/**
+	 * Use the specified naming convention for schemas. This naming convention applies for all schemas that have not been explicitly renamed using
+	 * {@link #renameSchema(String, String)} or {@link #renameSchemaAtPath(String, String...)}, and leaves the namespace name intact.
+	 *
+	 * @param schemaNamingConvention the naming convention to use
+	 * @return this {@code SchemaManipulator}
+	 */
+	public SchemaManipulator useSchemaNamingConvention(NamingConvention schemaNamingConvention) {
+		return useSchemaNamingConvention(NamingConvention.NULL, schemaNamingConvention);
+	}
+
+	/**
+	 * Use the specified naming conventions for schemas. These naming conventions apply for all schemas that have not been explicitly renamed using
+	 * {@link #renameSchema(String, String)} or {@link #renameSchemaAtPath(String, String...)}.
+	 *
+	 * @param namespaceNamingConvention the naming convention to use for the schema namespace
+	 * @param schemaNamingConvention    the naming convention to use for the schema (simple) name
+	 * @return this {@code SchemaManipulator}
+	 */
+	public SchemaManipulator useSchemaNamingConvention(NamingConvention namespaceNamingConvention, NamingConvention schemaNamingConvention) {
+		this.schemaNamingConvention = (path, schema) -> {
+			String namespace = schema.getNamespace();
+			String prefix = namespace == null ? "" : namespaceNamingConvention.convert(namespace) + ".";
+			String newFullName = prefix + schemaNamingConvention.convert(schema.getName());
+			return schema.getFullName().equals(newFullName) ? null : newFullName;
+		};
+		return this;
+	}
+
 	/**
 	 * Rename the specified field in the (named) schema.
 	 *
@@ -330,6 +371,22 @@ public SchemaManipulator renameFieldAtPath(String newFieldName, String... pathTo
 		return this;
 	}
 
+	/**
+	 * Use the specified naming conventions for fields. This naming convention applies for all fields that have not been explicitly renamed using
+	 * * {@link #renameField(String, String, String)} or {@link #renameFieldAtPath(String, String...)}.
+	 *
+	 * @param namingConvention the naming convention to use for the schema (simple) name
+	 * @return this {@code SchemaManipulator}
+	 */
+	public SchemaManipulator useFieldNamingConvention(NamingConvention namingConvention) {
+		this.fieldNamingConvention = (pathToField, schemaWithField, field) -> {
+			String oldName = field.name();
+			String newName = namingConvention.convert(oldName);
+			return oldName.equals(newName) ? null : newName;
+		};
+		return this;
+	}
+
 	/**
 	 * <p>Unwrap all arrays whose field names (except up to the last {@code ignoredMaxSuffixLength} characters) are equal.</p>
 	 *
@@ -384,7 +441,8 @@ public SchemaManipulator unwrapArray(String schemaName, String wrappingField) {
 	 * <p>Unwrap the array whose wrapping field is at the specified path.</p>
 	 *
 	 * <p>Wrapped arrays are an XML construct. They result in array fields without siblings in a record field (optionally in a union with null). In Avro,
-	 * Parquet, and in fact most/all other formats, they are both not needed and unwanted. This method unwraps them based on the path to the wrapping field.</p>
+	 * Parquet, and in fact most/all other formats, they are both not needed and unwanted. This method unwraps them based on the path to the wrapping field
+     * .</p>
 	 *
 	 * <p>When unwrapping, wrapped field will replace the wrapping field using the name of the wrapping field. As this is not a renaming action, no alias will
 	 * be added.</p>

diff --git a/src/main/java/opwvhk/avro/util/NamingConvention.java b/src/main/java/opwvhk/avro/util/NamingConvention.java
@@ -0,0 +1,238 @@
+/*
+ * Copyright © Oscar Westra van Holthe - Kind
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ */
+
+package opwvhk.avro.util;
+
+import org.jetbrains.annotations.NotNull;
+
+import java.text.Normalizer;
+import java.util.ArrayList;
+import java.util.Iterator;
+import java.util.List;
+import java.util.Locale;
+import java.util.function.UnaryOperator;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+/**
+ * <p>A class to provide <a href="https://en.wikipedia.org/wiki/Naming_convention_(programming)#Multiple-word_identifiers">naming conventions for multiple-word
+ * identifiers</a>, like camel case, snake case, etc.</p>
+ *
+ * <p>It is up to the user of the class to ensure the applied naming convention makes sense: using e.g. camel case for a script that has no notion of
+ * upper/lower case letters is not useful.</p>
+ *
+ * <h2>Algorithm</h2>
+ *
+ * <p>Casing is applied by first sanitising the text, and determining the list of words. Then the words are put together according to the selected style.</p>
+ *
+ * <p>This algorithm is somewhat opinionated: it does not make any special exceptions for acronyms. This is mostly in line with general guidelines, such as
+ * from <a href="https://www.oracle.com/java/technologies/javase/codeconventions-namingconventions.html">Java/Oracle</a>
+ * and <a href="https://learn.microsoft.com/en-us/dotnet/standard/design-guidelines/capitalization-conventions">Microsoft</a>, but ignores the Microsoft
+ * exception for two-letter acronyms (like IO).</p>
+ *
+ * <p>Sanitation is done by creating a {@link Normalizer.Form#NFD canonical decomposition}, and then removing everything that is not in the
+ * <a href="https://www.unicode.org/reports/tr44/#General_Category_Values">unicode categories</a> letter (L), number (N), space separator (Zs), connector
+ * punctuation (Pc) or dash punctuation (Pd). This also removes accents. Words are then determined by splitting along spacing and punctuation.</p>
+ *
+ * <h2>Defined conventions</h2>
+ *
+ * <p>There are a number of capitalisation conventions predefined, combining various delimiters and combinations of upper and lower case, as listed below:</p>
+ *
+ * <table><caption>Capitalisation Conventions</caption><thead>
+ *     <tr><th>Convention</th><th>Example</th></tr>
+ * </thead><tbody>
+ *     <tr><td>Pascal Case</td><td>PascalCase</td></tr>
+ *     <tr><td>Camel Case</td><td>camelCase</td></tr>
+ *     <tr><td>Snake Case</td><td>snake_case</td></tr>
+ *     <tr><td>Kebab Case</td><td>kebab-case</td></tr>
+ *     <tr><td>Pascal Snake Case</td><td>Pascal_Snake_Case</td></tr>
+ *     <tr><td>Camel Snake Case</td><td>camel_Snake_Case</td></tr>
+ *     <tr><td>Screaming Snake Case</td><td>SCREAMING_SNAKE_CASE</td></tr>
+ *     <tr><td>Train Case</td><td>Train-Case</td></tr>
+ *     <tr><td>Cobol Case</td><td>COBOL-CASE</td></tr>
+ * </tbody></table>
+ *
+ * <p>Note that there is no predefined convention for (upper) flat case. The reason is that they are not reversible. The other conventions can be applied in any
+ * sequence, and the last one deterministically determines the result. If any convention in between uses flat case, this is no longer true.</p>
+ *
+ * @see <a href="https://en.wikipedia.org/wiki/Naming_convention_(programming)#Multiple-word_identifiers">Naming conventions for multiple-word identifiers</a>
+ * @see <a href="https://www.oracle.com/java/technologies/javase/codeconventions-namingconventions.html">Java naming convensions</a>
+ * @see <a href="https://learn.microsoft.com/en-us/dotnet/standard/design-guidelines/capitalization-conventions">Microsoft capitalization conventions</a>
+ * @see <a href="https://www.unicode.org/reports/tr44/#General_Category_Values">Unicode annex #44, General Category Values</a>
+ */
+public class NamingConvention {
+	/**
+	 * Pascal Case: capitalized words without delimiter.
+	 */
+	public static final NamingConvention PASCAL_CASE = new NamingConvention("", WordCase.CAPITALIZED, WordCase.CAPITALIZED);
+	/**
+	 * Camel Case: lowercase first word followed by capitalized words, without delimiter.
+	 */
+	public static final NamingConvention CAMEL_CASE = new NamingConvention("", WordCase.LOWER_CASE, WordCase.CAPITALIZED);
+	/**
+	 * Snake Case: lowercase words, separated by underscores.
+	 */
+	public static final NamingConvention SNAKE_CASE = new NamingConvention("_", WordCase.LOWER_CASE, WordCase.LOWER_CASE);
+	/**
+	 * Kebab Case: lowercase words, separated by hyphens.
+	 */
+	public static final NamingConvention KEBAB_CASE = new NamingConvention("-", WordCase.LOWER_CASE, WordCase.LOWER_CASE);
+	/**
+	 * Pascal Snake Case: capitalized words, separated by underscores.
+	 */
+	public static final NamingConvention PASCAL_SNAKE_CASE = new NamingConvention("_", WordCase.CAPITALIZED, WordCase.CAPITALIZED);
+	/**
+	 * Camel Snake Case: lowercase first word followed by capitalized words, separated by underscores.
+	 */
+	public static final NamingConvention CAMEL_SNAKE_CASE = new NamingConvention("_", WordCase.LOWER_CASE, WordCase.CAPITALIZED);
+	/**
+	 * Screaming Snake Case: uppercase words, separated by underscores.
+	 */
+	public static final NamingConvention SCREAMING_SNAKE_CASE = new NamingConvention("_", WordCase.UPPER_CASE, WordCase.UPPER_CASE);
+	/**
+	 * Train Case: capitalized words, separated by hyphens.
+	 */
+	public static final NamingConvention TRAIN_CASE = new NamingConvention("-", WordCase.CAPITALIZED, WordCase.CAPITALIZED);
+	/**
+	 * Cobol Case: uppercase words, separated by hyphens.
+	 */
+	public static final NamingConvention COBOL_CASE = new NamingConvention("-", WordCase.UPPER_CASE, WordCase.UPPER_CASE);
+	/**
+	 * Dummy naming convention that returns the given name as-is.
+	 */
+	public static final NamingConvention NULL = new NamingConvention(null, null, null) {
+		@Override
+		public String convert(String name) {
+			return name;
+		}
+	};
+
+	private final String delimiter;
+	private final WordCase firstWord;
+	private final WordCase otherWords;
+
+	/**
+	 * Create a naming convention for multiple-word identifiers. Combining an empty delimiter with {@link WordCase#LOWER_CASE} or
+	 * {@link WordCase#UPPER_CASE} is discouraged, as the result cannot be converted to another naming convention.
+	 *
+	 * @param delimiter the word delimiter to use
+	 * @param firstWord the capitalization for the first word
+	 * @param otherWords the capitalization for the other words
+	 */
+	public NamingConvention(String delimiter, WordCase firstWord, WordCase otherWords) {
+		this.delimiter = delimiter;
+		this.firstWord = firstWord;
+		this.otherWords = otherWords;
+	}
+
+	/**
+	 * Convert a text/name to a name in this name case.
+	 *
+	 * @param name the name to convert
+	 * @return the name in this name case
+	 */
+	public String convert(String name) {
+		// First remove accents, extra punctuation, etc. Keep only letters, numbers, and dash & combining punctuation.
+		String cleanName = NAME_CHARACTER_FILTER.matcher(Normalizer.normalize(name, Normalizer.Form.NFD)).replaceAll("");
+		// if (cleanName.isEmpty() )
+
+		// Then split by boundary characters, and determine the first non-empty word
+		List<String> words = splitToWords(DELIMITER_BOUNDARY, cleanName);
+		if (words.isEmpty()) {
+			throw new IllegalArgumentException("The name contains no letters or numbers");
+		} else if (words.size() == 1) {
+			// The name contains no boundary characters: maybe it is camel case.
+			words = splitToWords(CAMEL_BOUNDARY, cleanName);
+		}
+
+		StringBuilder buffer = new StringBuilder((int) (name.length() * 1.2f));
+		Iterator<String> iterator = words.iterator();
+		buffer.append(firstWord.apply(iterator.next()));
+		iterator.forEachRemaining(word -> buffer.append(delimiter).append(otherWords.apply(word)));
+		return buffer.toString();
+	}
+
+	/**
+	 * Pattern to match anything that's not a letter, number or delimiter boundary.
+	 */
+	private static final Pattern NAME_CHARACTER_FILTER = Pattern.compile("[^\\p{L}\\p{N}\\p{Zs}\\p{Pd}\\p{Pc}]+");
+
+	/**
+	 * Pattern to match word boundaries using delimiters: any combination of spaces & dash/combining punctuation after a letter or number.
+	 */
+	private static final Pattern DELIMITER_BOUNDARY = Pattern.compile("[\\p{Zs}\\p{Pd}\\p{Pc}]+");
+	/**
+	 * Pattern to match any word boundary: the (zero-width) point between a lower- and uppercase letter, or any combination of spaces & punctuation.
+	 */
+	@SuppressWarnings("RegExpSimplifiable") // bug: the suggestion to remove [] from [\p{L}&&\P{Lu}] is wrong
+	private static final Pattern CAMEL_BOUNDARY = Pattern.compile("(?<=[\\p{L}&&\\P{Lu}])(?=\\p{Lu})");
+
+	@NotNull
+	private List<String> splitToWords(Pattern wordBoundary, String text) {
+		List<String> words = new ArrayList<>();
+		Matcher matcher = wordBoundary.matcher(text);
+		int start = 0;
+		while (matcher.find()) {
+			if (start < matcher.start()) {
+				// Only add non-empty words
+				words.add(text.substring(start, matcher.start()));
+			}
+			start = matcher.end();
+		}
+		if (start < text.length()) {
+			// There's text remaining: add it
+			words.add(text.substring(start));
+		}
+		return words;
+	}
+
+	/**
+	 * Operator to apply "proper" to a name part.
+	 */
+	public enum WordCase implements UnaryOperator<String> {
+		/**
+		 * Convert the word to lower case.
+		 */
+		LOWER_CASE {
+			@Override
+			public String apply(String word) {
+				return word.toLowerCase(Locale.ROOT);
+			}
+		},
+		/**
+		 * Convert the word to upper case.
+		 */
+		UPPER_CASE {
+			@Override
+			public String apply(String word) {
+				return word.toUpperCase(Locale.ROOT);
+			}
+		},
+		/**
+		 * Convert the word to lower case, except the first character (convert that to upper case).
+		 */
+		CAPITALIZED {
+			@Override
+			public String apply(String word) {
+				int firstCodePoint = word.codePointAt(0);
+				int sizeOfFirstCharacter = Character.charCount(firstCodePoint);
+				// Use toTitleCase instead of toUpperCase to properly handle digraphs.
+				return Character.toString(Character.toTitleCase(firstCodePoint)) + word.substring(sizeOfFirstCharacter).toLowerCase(Locale.ROOT);
+			}
+		}
+	}
+}