Skip to content

Commit

Permalink
Utf8FrameValidator: optimize for ASCII content
Browse files Browse the repository at this point in the history
  • Loading branch information
mostroverkhov committed Jul 20, 2024
1 parent badceaf commit 139a5d0
Show file tree
Hide file tree
Showing 2 changed files with 91 additions and 7 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@
import java.net.InetSocketAddress;
import java.net.SocketAddress;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.List;
Expand Down Expand Up @@ -272,6 +273,42 @@ void invalidFragmentCompletion() throws Exception {
}
}

@Test
void utf8Validator() {
  // Each list holds one buffer per prefix of the sample text (see stringList),
  // so the validator is exercised on payloads of every length up to the full sample.
  List<ByteBuf> asciiFrames =
      stringList(
          ByteBufAllocator.DEFAULT,
          "Are those shy Eurasian footwear, cowboy chaps, or jolly earthmoving headgear");
  List<ByteBuf> utf8Frames =
      stringList(
          ByteBufAllocator.DEFAULT, "Чуєш їх, доцю, га? Кумедна ж ти, прощайся без ґольфів!");
  try {
    WebSocketFrameListener.Utf8FrameValidator frameValidator =
        WebSocketFrameListener.Utf8FrameValidator.create();
    // A single validator instance is reused for all frames, ASCII samples first.
    asciiFrames.forEach(
        frame -> Assertions.assertThat(frameValidator.validateTextFrame(frame)).isTrue());
    utf8Frames.forEach(
        frame -> Assertions.assertThat(frameValidator.validateTextFrame(frame)).isTrue());
  } finally {
    // Buffers are released whether or not an assertion failed.
    asciiFrames.forEach(ByteBuf::release);
    utf8Frames.forEach(ByteBuf::release);
  }
}

/**
 * Encodes every non-empty prefix of {@code string} (cut at char indices 1..length) as UTF-8.
 *
 * @param allocator allocator used for each prefix buffer
 * @param string text whose prefixes are encoded
 * @return one buffer per prefix, shortest first; the caller is responsible for releasing them
 */
static List<ByteBuf> stringList(ByteBufAllocator allocator, String string) {
  int total = string.length();
  List<ByteBuf> prefixes = new ArrayList<>(total);
  for (int end = 1; end <= total; end++) {
    prefixes.add(ByteBufUtil.writeUtf8(allocator, string.substring(0, end)));
  }
  return prefixes;
}

@Test
void utf8TextFrameValidator() {
ByteBufAllocator alloc = ByteBufAllocator.DEFAULT;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,6 @@

import io.netty.buffer.ByteBuf;
import io.netty.channel.ChannelHandlerContext;
import io.netty.util.ByteProcessor;
import io.netty.util.CharsetUtil;

/**
Expand Down Expand Up @@ -76,9 +75,9 @@ public static String reason(ByteBuf payload) {

/**
* UTF8 finite state machine based implementation from
* https://bjoern.hoehrmann.de/utf-8/decoder/dfa/
* https://bjoern.hoehrmann.de/utf-8/decoder/dfa/ optimized for ASCII content.
*/
final class Utf8FrameValidator implements ByteProcessor {
final class Utf8FrameValidator {
public static final int UTF8_VALIDATION_ERROR_CODE = 1007;
public static final String UTF8_VALIDATION_ERROR_MESSAGE =
"inbound text frame with non-utf8 contents";
Expand Down Expand Up @@ -120,7 +119,7 @@ public static Utf8FrameValidator create() {
* @return true if payload is utf8 encoded, false otherwise
*/
public boolean validateTextFrame(ByteBuf buffer) {
buffer.forEachByte(this);
checkUtf8(buffer);
int st = state;
state = UTF8_ACCEPT;
codep = 0;
Expand All @@ -132,7 +131,7 @@ public boolean validateTextFrame(ByteBuf buffer) {
* @return true if payload is utf8 encoded, false otherwise
*/
public boolean validateTextFragmentStart(ByteBuf buffer) {
// NOTE(review): the next two statements look like the removed/added pair of a single diff
// hunk (forEachByte(this) replaced by checkUtf8(buffer)) — confirm that only one of them
// is meant to remain, otherwise the payload is fed through the state machine twice.
buffer.forEachByte(this);
checkUtf8(buffer);
// Unlike validateTextFrame, state and codep are not reset here; presumably so that following
// fragments continue from a sequence left incomplete at the fragment boundary — confirm.
return state != UTF8_REJECT;
}

Expand All @@ -152,8 +151,56 @@ public boolean validateFragmentEnd(ByteBuf buffer) {
return validateTextFrame(buffer);
}

@Override
public boolean process(byte bufferByte) {
/**
 * Feeds the readable bytes of {@code buffer} through the UTF-8 state machine, updating the
 * validator's {@code state} as a side effect. Bytes are read with absolute-index getters, 8 at
 * a time, then 4 at a time, then one by one for the remaining tail.
 *
 * <p>A whole word is skipped without per-byte checks only when every byte in it is ASCII
 * <em>and</em> the state machine is currently in {@code UTF8_ACCEPT}. The second condition is
 * required for correctness: if a multi-byte sequence is still open (for example a lead byte was
 * the last byte of the previous word, or of the previous fragment), the ASCII bytes must be fed
 * to the state machine so the truncated sequence is detected. Skipping them would let a later
 * continuation byte complete the sequence and invalid input would be reported as valid.
 *
 * <p>Scanning stops as soon as the per-byte check reports {@code false}.
 *
 * @param buffer payload to check
 */
private void checkUtf8(ByteBuf buffer) {
  int from = buffer.readerIndex();
  int to = from + buffer.readableBytes();

  // 8 bytes at a time.
  while (to - from >= Long.BYTES) {
    long bytes = buffer.getLong(from);
    boolean needsCheck = state != UTF8_ACCEPT || (bytes & 0x8080808080808080L) != 0;
    if (needsCheck && !checkUtf8Bytes(bytes, Long.BYTES)) {
      return;
    }
    from += Long.BYTES;
  }

  // At most one 4-byte word can remain after the loop above; kept as a loop for symmetry.
  while (to - from >= Integer.BYTES) {
    int bytes = buffer.getInt(from);
    boolean needsCheck = state != UTF8_ACCEPT || (bytes & 0x80808080) != 0;
    // Sign extension of the int is harmless: only its low 4 bytes are extracted.
    if (needsCheck && !checkUtf8Bytes(bytes, Integer.BYTES)) {
      return;
    }
    from += Integer.BYTES;
  }

  // Remaining tail, one byte at a time.
  while (from < to) {
    if (!checkUtf8(buffer.getByte(from))) {
      return;
    }
    from++;
  }
}

/**
 * Runs the per-byte check over the lowest {@code count} bytes of {@code bytes}, most
 * significant of those bytes first (the order in which they were read from the buffer).
 *
 * @return {@code false} as soon as the per-byte check returns {@code false}, otherwise {@code
 *     true}
 */
private boolean checkUtf8Bytes(long bytes, int count) {
  for (int shift = Byte.SIZE * (count - 1); shift >= 0; shift -= Byte.SIZE) {
    if (!checkUtf8((byte) (bytes >> shift))) {
      return false;
    }
  }
  return true;
}

private boolean checkUtf8(byte bufferByte) {
byte type = TYPES[bufferByte & 0xFF];
int st = state;
codep = st != UTF8_ACCEPT ? bufferByte & 0x3f | codep << 6 : 0xff >> type & bufferByte;
Expand Down

0 comments on commit 139a5d0

Please sign in to comment.