Skip to content

Commit

Permalink
Add naming conventions
Browse files Browse the repository at this point in the history
  • Loading branch information
opwvhk committed Feb 20, 2024
1 parent 8571eb3 commit 513e27c
Show file tree
Hide file tree
Showing 4 changed files with 946 additions and 497 deletions.
82 changes: 70 additions & 12 deletions src/main/java/opwvhk/avro/SchemaManipulator.java
Original file line number Diff line number Diff line change
@@ -1,5 +1,12 @@
package opwvhk.avro;

import net.jimblackler.jsonschemafriend.GenerationException;
import opwvhk.avro.json.SchemaAnalyzer;
import opwvhk.avro.util.AvroSchemaUtils;
import opwvhk.avro.util.NamingConvention;
import opwvhk.avro.xml.XsdAnalyzer;
import org.apache.avro.Schema;

import java.io.IOException;
import java.io.InputStream;
import java.net.URISyntaxException;
Expand All @@ -10,12 +17,7 @@
import java.util.List;
import java.util.Objects;
import java.util.Set;

import net.jimblackler.jsonschemafriend.GenerationException;
import opwvhk.avro.json.SchemaAnalyzer;
import opwvhk.avro.util.AvroSchemaUtils;
import opwvhk.avro.xml.XsdAnalyzer;
import org.apache.avro.Schema;
import java.util.stream.Stream;

import static java.util.Objects.requireNonNull;

Expand All @@ -29,11 +31,18 @@ public class SchemaManipulator {
private boolean renameWithAliases;
private StringBuilder markdownBuffer;
private List<SchemaRenamer> schemaRenamerList;
private SchemaRenamer schemaNamingConvention;
private List<FieldRenamer> fieldRenamerList;
private FieldRenamer fieldNamingConvention;
private List<UnwrapTest> unwrapTests;

/**
 * Create a schema manipulator for a given schema.
 *
 * <p>All settings of the new manipulator are initialised by {@code reset(Schema)}.</p>
 *
 * @param schema the schema to manipulate
 */
public SchemaManipulator(Schema schema) {
    reset(schema);
}

private void reset(Schema initialSchema) {
Expand All @@ -42,7 +51,9 @@ private void reset(Schema initialSchema) {
renameWithAliases = true;
markdownBuffer = null;
schemaRenamerList = new ArrayList<>();
schemaNamingConvention = (pathToField, fieldSchema) -> null;
fieldRenamerList = new ArrayList<>();
fieldNamingConvention = (pathToField, schemaWithField, field) -> null;
unwrapTests = new ArrayList<>();
}

Expand Down Expand Up @@ -71,7 +82,8 @@ public static SchemaManipulator startFromAvro(URL schemaLocation) throws IOExcep
}

/**
* Create a schema manipulator from an XML Schema Definition (XSD). The location of the main {@code .xsd} file is provided, both to provide the XSD
* content and to provide a way to locate imported/included {@code .xsd} files.
*
* @param schemaLocation the location of the main {@code .xsd} file (it may include/import other {@code .xsd} files)
Expand Down Expand Up @@ -210,15 +222,15 @@ private Schema applySchemaChanges(IdentityHashMap<Schema, Schema> changedSchemas
}

private String newSchemaName(String path, Schema schema) {
return schemaRenamerList.stream()
return Stream.concat(schemaRenamerList.stream(), Stream.of(schemaNamingConvention))
.map(renamer -> renamer.newSchemaName(path, schema))
.filter(Objects::nonNull)
.findAny()
.orElse(null);
}

private String newFieldName(String path, Schema schemaWithField, Schema.Field field) {
return fieldRenamerList.stream()
return Stream.concat(fieldRenamerList.stream(), Stream.of(fieldNamingConvention))
.map(renamer -> renamer.newFieldName(path, schemaWithField, field))
.filter(Objects::nonNull)
.findAny()
Expand Down Expand Up @@ -297,6 +309,35 @@ public SchemaManipulator renameSchemaAtPath(String newSchemaName, String... path
return this;
}

/**
 * Apply a naming convention to the (simple) names of schemas, keeping their namespaces as they are. The convention is used for every schema that was not
 * renamed explicitly via {@link #renameSchema(String, String)} or {@link #renameSchemaAtPath(String, String...)}.
 *
 * @param schemaNamingConvention the naming convention to use
 * @return this {@code SchemaManipulator}
 */
public SchemaManipulator useSchemaNamingConvention(NamingConvention schemaNamingConvention) {
    // The NULL convention returns names as-is, so namespaces stay untouched.
    NamingConvention keepNamespace = NamingConvention.NULL;
    return useSchemaNamingConvention(keepNamespace, schemaNamingConvention);
}

/**
 * Use the specified naming conventions for schemas. These naming conventions apply for all schemas that have not been explicitly renamed using
 * {@link #renameSchema(String, String)} or {@link #renameSchemaAtPath(String, String...)}.
 *
 * <p>The namespace naming convention is applied to each dot-separated segment of the namespace separately, so the structure of the namespace is
 * preserved.</p>
 *
 * @param namespaceNamingConvention the naming convention to use for (each segment of) the schema namespace
 * @param schemaNamingConvention    the naming convention to use for the schema (simple) name
 * @return this {@code SchemaManipulator}
 */
public SchemaManipulator useSchemaNamingConvention(NamingConvention namespaceNamingConvention, NamingConvention schemaNamingConvention) {
    this.schemaNamingConvention = (path, schema) -> {
        StringBuilder fullNameBuffer = new StringBuilder();
        String namespace = schema.getNamespace();
        if (namespace != null) {
            // Convert segment by segment: NamingConvention#convert removes all characters that are not letters, numbers, spaces, dashes or
            // connector punctuation. Converting the namespace as a whole would therefore drop the dots and merge all segments into one.
            for (String segment : namespace.split("\\.")) {
                // Empty segments cannot be converted (there is no word in them); keep them as they are.
                fullNameBuffer.append(segment.isEmpty() ? segment : namespaceNamingConvention.convert(segment)).append('.');
            }
        }
        fullNameBuffer.append(schemaNamingConvention.convert(schema.getName()));

        String newFullName = fullNameBuffer.toString();
        // A null result signals that the schema does not need to be renamed.
        return schema.getFullName().equals(newFullName) ? null : newFullName;
    };
    return this;
}

/**
* Rename the specified field in the (named) schema.
*
Expand Down Expand Up @@ -330,6 +371,22 @@ public SchemaManipulator renameFieldAtPath(String newFieldName, String... pathTo
return this;
}

/**
 * Use the specified naming convention for fields. This naming convention applies to all fields that have not been explicitly renamed using
 * {@link #renameField(String, String, String)} or {@link #renameFieldAtPath(String, String...)}.
 *
 * @param namingConvention the naming convention to use for field names
 * @return this {@code SchemaManipulator}
 */
public SchemaManipulator useFieldNamingConvention(NamingConvention namingConvention) {
    this.fieldNamingConvention = (pathToField, schemaWithField, field) -> {
        String currentName = field.name();
        String convertedName = namingConvention.convert(currentName);
        if (currentName.equals(convertedName)) {
            // The field already adheres to the convention: null signals that no rename is needed.
            return null;
        }
        return convertedName;
    };
    return this;
}

/**
* <p>Unwrap all arrays whose field names (except up to the last {@code ignoredMaxSuffixLength} characters) are equal.</p>
*
Expand Down Expand Up @@ -384,7 +441,8 @@ public SchemaManipulator unwrapArray(String schemaName, String wrappingField) {
* <p>Unwrap the array whose wrapping field is at the specified path.</p>
*
* <p>Wrapped arrays are an XML construct. They result in array fields without siblings in a record field (optionally in a union with null). In Avro,
* Parquet, and in fact most/all other formats, they are both not needed and unwanted. This method unwraps them based on the path to the wrapping
* field.</p>
*
* <p>When unwrapping, the wrapped field will replace the wrapping field using the name of the wrapping field. As this is not a renaming action, no alias will
* be added.</p>
Expand Down
238 changes: 238 additions & 0 deletions src/main/java/opwvhk/avro/util/NamingConvention.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,238 @@
/*
* Copyright © Oscar Westra van Holthe - Kind
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
*/

package opwvhk.avro.util;

import org.jetbrains.annotations.NotNull;

import java.text.Normalizer;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.Locale;
import java.util.function.UnaryOperator;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
* <p>A class to provide <a href="https://en.wikipedia.org/wiki/Naming_convention_(programming)#Multiple-word_identifiers">naming conventions for multiple-word
* identifiers</a>, like camel case, snake case, etc.</p>
*
* <p>It is up to the user of the class to ensure the applied naming convention makes sense: using e.g. camel case for a script that has no notion of
* upper/lower case letters is not useful.</p>
*
* <h2>Algorithm</h2>
*
* <p>Casing is applied by first sanitising the text, and determining the list of words. Then the words are put together according to the selected style.</p>
*
* <p>This algorithm is somewhat opinionated: it does not make any special exceptions for acronyms. This is mostly in line with general guidelines, such as
* from <a href="https://www.oracle.com/java/technologies/javase/codeconventions-namingconventions.html">Java/Oracle</a>
* and <a href="https://learn.microsoft.com/en-us/dotnet/standard/design-guidelines/capitalization-conventions">Microsoft</a>, but ignores the Microsoft
* exception for two-letter acronyms (like IO).</p>
*
* <p>Sanitisation is done by creating a {@link Normalizer.Form#NFD canonical decomposition}, and then removing everything that is not in the
* <a href="https://www.unicode.org/reports/tr44/#General_Category_Values">unicode categories</a> letter (L), number (N), space separator (Zs), connector
* punctuation (Pc) or dash punctuation (Pd). This also removes accents. Words are then determined by splitting along spacing and punctuation.</p>
*
* <h2>Defined conventions</h2>
*
* <p>There are a number of capitalisation conventions predefined, combining various delimiters and combinations of upper and lower case, as listed below:</p>
*
* <table><caption>Capitalisation Conventions</caption><thead>
* <tr><th>Convention</th><th>Example</th></tr>
* </thead><tbody>
* <tr><td>Pascal Case</td><td>PascalCase</td></tr>
* <tr><td>Camel Case</td><td>camelCase</td></tr>
* <tr><td>Snake Case</td><td>snake_case</td></tr>
* <tr><td>Kebab Case</td><td>kebab-case</td></tr>
* <tr><td>Pascal Snake Case</td><td>Pascal_Snake_Case</td></tr>
* <tr><td>Camel Snake Case</td><td>camel_Snake_Case</td></tr>
* <tr><td>Screaming Snake Case</td><td>SCREAMING_SNAKE_CASE</td></tr>
* <tr><td>Train Case</td><td>Train-Case</td></tr>
* <tr><td>Cobol Case</td><td>COBOL-CASE</td></tr>
* </tbody></table>
*
* <p>Note that there is no predefined convention for (upper) flat case. The reason is that they are not reversible. The other conventions can be applied in any
* sequence, and the last one deterministically determines the result. If any convention in between uses flat case, this is no longer true.</p>
*
* @see <a href="https://en.wikipedia.org/wiki/Naming_convention_(programming)#Multiple-word_identifiers">Naming conventions for multiple-word identifiers</a>
* @see <a href="https://www.oracle.com/java/technologies/javase/codeconventions-namingconventions.html">Java naming conventions</a>
* @see <a href="https://learn.microsoft.com/en-us/dotnet/standard/design-guidelines/capitalization-conventions">Microsoft capitalization conventions</a>
* @see <a href="https://www.unicode.org/reports/tr44/#General_Category_Values">Unicode annex #44, General Category Values</a>
*/
public class NamingConvention {
    /**
     * Pascal Case: capitalized words without delimiter.
     */
    public static final NamingConvention PASCAL_CASE = new NamingConvention("", WordCase.CAPITALIZED, WordCase.CAPITALIZED);
    /**
     * Camel Case: lowercase first word followed by capitalized words, without delimiter.
     */
    public static final NamingConvention CAMEL_CASE = new NamingConvention("", WordCase.LOWER_CASE, WordCase.CAPITALIZED);
    /**
     * Snake Case: lowercase words, separated by underscores.
     */
    public static final NamingConvention SNAKE_CASE = new NamingConvention("_", WordCase.LOWER_CASE, WordCase.LOWER_CASE);
    /**
     * Kebab Case: lowercase words, separated by hyphens.
     */
    public static final NamingConvention KEBAB_CASE = new NamingConvention("-", WordCase.LOWER_CASE, WordCase.LOWER_CASE);
    /**
     * Pascal Snake Case: capitalized words, separated by underscores.
     */
    public static final NamingConvention PASCAL_SNAKE_CASE = new NamingConvention("_", WordCase.CAPITALIZED, WordCase.CAPITALIZED);
    /**
     * Camel Snake Case: lowercase first word followed by capitalized words, separated by underscores.
     */
    public static final NamingConvention CAMEL_SNAKE_CASE = new NamingConvention("_", WordCase.LOWER_CASE, WordCase.CAPITALIZED);
    /**
     * Screaming Snake Case: uppercase words, separated by underscores.
     */
    public static final NamingConvention SCREAMING_SNAKE_CASE = new NamingConvention("_", WordCase.UPPER_CASE, WordCase.UPPER_CASE);
    /**
     * Train Case: capitalized words, separated by hyphens.
     */
    public static final NamingConvention TRAIN_CASE = new NamingConvention("-", WordCase.CAPITALIZED, WordCase.CAPITALIZED);
    /**
     * Cobol Case: uppercase words, separated by hyphens.
     */
    public static final NamingConvention COBOL_CASE = new NamingConvention("-", WordCase.UPPER_CASE, WordCase.UPPER_CASE);
    /**
     * Dummy naming convention that returns the given name as-is.
     */
    public static final NamingConvention NULL = new NamingConvention(null, null, null) {
        @Override
        public String convert(String name) {
            // The (null) delimiter and word cases are never used, as the name is not taken apart.
            return name;
        }
    };

    /**
     * Pattern to match anything that's not a letter, number or delimiter character (space separator, dash punctuation or connector punctuation).
     */
    private static final Pattern NAME_CHARACTER_FILTER = Pattern.compile("[^\\p{L}\\p{N}\\p{Zs}\\p{Pd}\\p{Pc}]+");
    /**
     * Pattern to match word boundaries using delimiters: any combination of space separators and dash/connector punctuation.
     */
    private static final Pattern DELIMITER_BOUNDARY = Pattern.compile("[\\p{Zs}\\p{Pd}\\p{Pc}]+");
    /**
     * Pattern to match camel case word boundaries: the (zero-width) point between a letter that is not uppercase and an uppercase letter.
     */
    @SuppressWarnings("RegExpSimplifiable") // bug: the suggestion to remove [] from [\p{L}&&\P{Lu}] is wrong
    private static final Pattern CAMEL_BOUNDARY = Pattern.compile("(?<=[\\p{L}&&\\P{Lu}])(?=\\p{Lu})");

    private final String delimiter;
    private final WordCase firstWord;
    private final WordCase otherWords;

    /**
     * Create a naming convention for multiple-word identifiers. Combining an empty delimiter with {@link WordCase#LOWER_CASE} or
     * {@link WordCase#UPPER_CASE} is discouraged, as the result cannot be converted to another naming convention.
     *
     * @param delimiter  the word delimiter to use
     * @param firstWord  the capitalization for the first word
     * @param otherWords the capitalization for the other words
     */
    public NamingConvention(String delimiter, WordCase firstWord, WordCase otherWords) {
        this.delimiter = delimiter;
        this.firstWord = firstWord;
        this.otherWords = otherWords;
    }

    /**
     * Convert a text/name to a name in this name case.
     *
     * <p>The name is sanitised, split into words along spaces and dash/connector punctuation, and put together again using the delimiter and word cases
     * of this naming convention. If that yields a single word, the word is split along camel case boundaries instead.</p>
     *
     * @param name the name to convert
     * @return the name in this name case
     * @throws IllegalArgumentException if the name contains no letters or numbers
     */
    public String convert(String name) {
        // First remove accents, extra punctuation, etc. Keep only letters, numbers, spaces, and dash & connector punctuation.
        // The canonical decomposition separates accents from their base letter; being marks, they are then filtered out.
        String decomposedName = Normalizer.normalize(name, Normalizer.Form.NFD);
        String cleanName = NAME_CHARACTER_FILTER.matcher(decomposedName).replaceAll("");

        // Then split by boundary characters; this yields non-empty words only.
        List<String> words = splitToWords(DELIMITER_BOUNDARY, cleanName);
        if (words.isEmpty()) {
            throw new IllegalArgumentException("The name contains no letters or numbers");
        }
        if (words.size() == 1) {
            // The name contains no boundary characters between words: maybe it is camel case.
            // Split the word, not the clean name: the latter may still have leading/trailing delimiters, which would end up in the result.
            words = splitToWords(CAMEL_BOUNDARY, words.get(0));
        }

        StringBuilder buffer = new StringBuilder((int) (name.length() * 1.2f));
        Iterator<String> iterator = words.iterator();
        buffer.append(firstWord.apply(iterator.next()));
        iterator.forEachRemaining(word -> buffer.append(delimiter).append(otherWords.apply(word)));
        return buffer.toString();
    }

    /**
     * Split a text into words along the matches of a pattern, discarding empty words.
     *
     * @param wordBoundary the pattern that matches the boundaries between words (zero-width matches are supported)
     * @param text         the text to split
     * @return the non-empty words in the text, in order of appearance; never {@code null}, but empty if the text contains no words
     */
    private static List<String> splitToWords(Pattern wordBoundary, String text) {
        List<String> words = new ArrayList<>();
        Matcher matcher = wordBoundary.matcher(text);
        int start = 0;
        while (matcher.find()) {
            if (start < matcher.start()) {
                // Only add non-empty words
                words.add(text.substring(start, matcher.start()));
            }
            start = matcher.end();
        }
        if (start < text.length()) {
            // There's text remaining: add it
            words.add(text.substring(start));
        }
        return words;
    }

    /**
     * Operator to apply the casing for a single word of a name.
     */
    public enum WordCase implements UnaryOperator<String> {
        /**
         * Convert the word to lower case.
         */
        LOWER_CASE {
            @Override
            public String apply(String word) {
                return word.toLowerCase(Locale.ROOT);
            }
        },
        /**
         * Convert the word to upper case.
         */
        UPPER_CASE {
            @Override
            public String apply(String word) {
                return word.toUpperCase(Locale.ROOT);
            }
        },
        /**
         * Convert the word to lower case, except the first character (convert that to title case). An empty word is returned as-is.
         */
        CAPITALIZED {
            @Override
            public String apply(String word) {
                if (word.isEmpty()) {
                    // There is no first character to capitalize.
                    return word;
                }
                int firstCodePoint = word.codePointAt(0);
                int sizeOfFirstCharacter = Character.charCount(firstCodePoint);
                // Use toTitleCase instead of toUpperCase to properly handle digraphs.
                return Character.toString(Character.toTitleCase(firstCodePoint)) + word.substring(sizeOfFirstCharacter).toLowerCase(Locale.ROOT);
            }
        }
    }
}
Loading

0 comments on commit 513e27c

Please sign in to comment.