Skip to content

Commit

Permalink
v. 0.5.3 Bug fix of reading UTF-8 fields into Java String objects
Browse files Browse the repository at this point in the history
  • Loading branch information
jjenkov committed Dec 12, 2019
1 parent c24f434 commit 85f0b95
Show file tree
Hide file tree
Showing 6 changed files with 206 additions and 15 deletions.
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -125,5 +125,6 @@ the next section.

| Version | Java Version | Change |
|---------|--------------|--------|
| 0.5.3 | Java 8 | Bug fix of reading UTF-8 fields into Java Strings. |
| 0.5.2 | Java 8 | First release |

152 changes: 152 additions & 0 deletions notes/tion-notes.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,152 @@
Field value encodings

/.../ Hex
'...' Base64
"..." UTF-8

(...) Comments

token Single token - UTF-8 encoded - extends until next whitespace character

<...> ?? ( ... default for field ? )



Full field type encodings:

#0 Bytes
#1 Boolean
#2 Int-Pos
#3 Int-Neg
#4 Float
#5 UTF-8
#6 UTF-8 short
#7 UTC
#8 Unused
#9 Unused
#10 Array (A)
#11 Table (B)
#12 Object (C)
#13 Key (D)
#14 Key short (E)
#15 Extended (F)




#Abbreviated field type encodings
& Bytes
! Boolean
+ Int-Pos
- Int-Neg
% float 4
. float 8
? UTF-8
$ UTF-8 short
@ UTC

: Key / Key-short ?

* extended ?

; ??
, ??


#Omitted field types => default field types

/FE45 B14D/ Standalone Hex value defaults to Bytes field
'5345AdoP53' Standalone Base64 value defaults to Bytes field
"Hello World" Standalone UTF-8 value defaults to UTF-8 field
1234545 Standalone single token UTF-8 value defaults to UTF-8



Composite type delimiters

[...] Array
|...| Table
{...} Object



Full field type examples:

#0'ab45Rfwer2343' Bytes field - value encoded as Base64 (inside ' ')
#0/ab45fe21/ Bytes field - value encoded as Hex (inside / /)
#1"true" Boolean field - value encoded as UTF-8
#2"123" Int-Pos field - value encoded as UTF-8
#2123 Int-Pos field - value encoded as UTF-8 token
#3"474" Int-Neg field - value encoded as UTF-8 (actually represents minus 474)
#4"123.456" Float field - value encoded as UTF-8
#5"Hello World" UTF-8 field - value encoded as UTF-8
#6"Hello World" UTF-8 short field - value encoded as UTF-8
#7"2020-01-31" UTC field - value encoded as UTF-8 - ISO standard

#10[ #1"true" #1"false" #1"true" ] Array of Boolean fields

#11| #13"fieldName1" #13"fieldName2" #13"fieldName3" Table with 3 columns (3 key fields = 3 columns)
#2"123" #1"true" #5"value 3"
#2"456" #1"false" #5"value 6"
|

#12{ #13"fieldName1" #2"123" Object with 3 key-value pairs
#13"fieldName2" #1"true"
#13"fieldName3" #5"value 3"
}



Abbreviated field type examples:

&'ab45Rfwer2343' Bytes field - value encoded as Base64 (inside ' ')
&/ab45fe21/ Bytes field - value encoded as Hex (inside / /)
!"true" Boolean field - value encoded as UTF-8
+"123" Int-Pos field - value encoded as UTF-8
+123 Int-Pos field - value encoded as UTF-8 token
-"474" Int-Neg field - value encoded as UTF-8 (actually represents minus 474)
%"123.456" Float field - value encoded as UTF-8
?"Hello World" UTF-8 field - value encoded as UTF-8
#6"Hello World" UTF-8 short field - value encoded as UTF-8 (cannot be abbreviated?)
@"2020-01-31" UTC field - value encoded as UTF-8 - ISO standard

[ !"true" !"false" !"true" ] Array of Boolean fields

| $"fieldName1" $"fieldName2" $"fieldName3" Table with 3 columns (3 key fields = 3 columns)
+"123" !"true" ?"value 3"
+"456" !"false" ?"value 6"
|

{ $"fieldName1" +"123" Object with 3 key-value pairs
$"fieldName2" !"true"
$"fieldName3" ?"value 3"
}



Abbreviated field type - abbreviated value enclosure (token) examples:

&ab45Rfwer2343 Bytes field - value encoded as Base64 token
&/ab45fe21/ Bytes field - value encoded as Hex (inside / /)
!true Boolean field - value encoded as UTF-8 token
+123 Int-Pos field - value encoded as UTF-8 token
-474 Int-Neg field - value encoded as UTF-8 token
%123.456 Float field - value encoded as UTF-8 token
?Hello UTF-8 field - value encoded as UTF-8 token
@2020-01-31 UTC field - value encoded as UTF-8 - ISO standard

[ !true !false !true ] Array of Boolean fields

| :fieldName1 :fieldName2 :fieldName3 Table with 3 columns (3 key fields = 3 columns)
+123 !true ?"value 3"
+456 !false ?"value 6"
|

{ :fieldName1 +123 Object with 3 key-value pairs
:fieldName2 !true
:fieldName3 ?tokenValue
}




2 changes: 1 addition & 1 deletion pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@

<groupId>com.nanosai</groupId>
<artifactId>rion-ops</artifactId>
<version>0.5.2</version>
<version>0.5.3</version>
<packaging>jar</packaging>

<name>RION Ops for Java</name>
Expand Down
7 changes: 4 additions & 3 deletions src/main/java/com/nanosai/rionops/rion/read/RionReader.java
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
import com.nanosai.memops.objects.Bytes;
import com.nanosai.rionops.rion.RionFieldTypes;

import java.nio.charset.StandardCharsets;
import java.util.Calendar;
import java.util.GregorianCalendar;
import java.util.TimeZone;
Expand Down Expand Up @@ -331,7 +332,7 @@ public int readUtf8(byte[] dest, int offset, int length){
public String readUtf8String(){
if(this.fieldLengthLength == 0) return null;

return new String(this.source, this.index, this.fieldLength);
return new String(this.source, this.index, this.fieldLength, StandardCharsets.UTF_8);
}

public Calendar readUtcCalendar() {
Expand Down Expand Up @@ -447,7 +448,7 @@ public int readKey(byte[] dest, int offset, int length){
public String readKeyAsUtf8String(){
if(this.fieldLengthLength == 0) return null;

return new String(this.source, this.index, this.fieldLength);
return new String(this.source, this.index, this.fieldLength, StandardCharsets.UTF_8);
}

public int readKeyShort(byte[] dest){
Expand All @@ -469,7 +470,7 @@ public int readKeyShort(byte[] dest, int offset, int length){
public String readKeyShortAsUtf8String(){
if(this.fieldLengthLength == 0) return null;

return new String(this.source, this.index, this.fieldLength);
return new String(this.source, this.index, this.fieldLength, StandardCharsets.UTF_8);
}

public long readKeyShortAsLong() {
Expand Down
27 changes: 22 additions & 5 deletions src/main/java/com/nanosai/rionops/rion/write/RionWriter.java
Original file line number Diff line number Diff line change
Expand Up @@ -29,8 +29,8 @@ public class RionWriter {
public int index = 0;

//todo should these be called "complex" or "nested" ??
private int[] complexFieldStack = null; //used to store start indexes of complex fields that can contain nested fields.
private int complexFieldStackIndex = -1; //start at -1 - will be incremented before first use.
private int[] compositeFieldStack = null; //used to store start indexes of complex fields that can contain nested fields.
private int compositeFieldStackIndex = -1; //start at -1 - will be incremented before first use.

public RionWriter() {
}
Expand Down Expand Up @@ -62,7 +62,7 @@ public RionWriter setOffset(int offset){
}

public RionWriter setNestedFieldStack(int[] stack){
this.complexFieldStack = stack;
this.compositeFieldStack = stack;
return this;
}

Expand Down Expand Up @@ -351,12 +351,12 @@ public void writeObjectEnd(int objectStartIndex, int lengthLength, int length){
}

public void writeObjectBeginPush(int lengthLength){
this.complexFieldStack[++this.complexFieldStackIndex] = this.index;
this.compositeFieldStack[++this.compositeFieldStackIndex] = this.index;
this.dest[this.index++] = (byte) (255 & ((RionFieldTypes.OBJECT << 4) | lengthLength));
this.index += lengthLength;
}
public void writeObjectEndPop(){
int objectStartIndex = this.complexFieldStack[this.complexFieldStackIndex--];
int objectStartIndex = this.compositeFieldStack[this.compositeFieldStackIndex--];
int lengthLength = 15 & (this.dest[objectStartIndex]);
int length = this.index - objectStartIndex - 1 - lengthLength;

Expand Down Expand Up @@ -388,6 +388,23 @@ public void writeTableBegin(int lengthLength, int elementCount){
}
}

// new ...
/**
 * Begins a Table field at the current write position and pushes that start
 * position onto the composite field stack.
 *
 * <p>Writes the Table lead byte, skips {@code lengthLength} bytes after it, and then
 * writes {@code elementCount} as an Int-Pos field.
 *
 * <p>The composite field stack is {@code null} until one is supplied via
 * {@link #setNestedFieldStack(int[])}, so a stack must be set before calling this method.
 *
 * @param lengthLength value stored in the low 4 bits of the Table lead byte; also the
 *                     number of bytes skipped after the lead byte
 *                     (NOTE(review): presumably reserved for the table's length and
 *                     back-filled when the table is ended - confirm against the matching
 *                     end method)
 * @param elementCount the element count written as the first nested Int-Pos field
 */
public void writeTableBeginPush(int lengthLength, int elementCount){
// Remember where this Table field starts (the index of its lead byte).
this.compositeFieldStack[++this.compositeFieldStackIndex] = this.index;

// Lead byte: field type in the high 4 bits, lengthLength in the low 4 bits.
this.dest[this.index++] = (byte) (255 & ((RionFieldTypes.TABLE << 4) | lengthLength));
// Skip over the length bytes without writing them here.
this.index += lengthLength;

// NOTE(review): lengthOfInt64Value presumably returns the number of bytes needed to hold the value - confirm.
int elementCountLengthLength = lengthOfInt64Value(elementCount);
dest[this.index++] = (byte) (255 & ((RionFieldTypes.INT_POS << 4) | elementCountLengthLength));
// Write the element count one byte at a time, most significant byte first.
for(int i=(elementCountLengthLength-1)*8; i >= 0; i-=8){
dest[this.index++] = (byte) (255 & (elementCount >> i));
}
}




public void writeTableEnd(int objectStartIndex, int lengthLength, int length){
objectStartIndex++; //jump over the lead byte of the ION Object field

Expand Down
32 changes: 26 additions & 6 deletions src/test/java/com/nanosai/rionops/rion/read/RionReaderTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -213,30 +213,31 @@ public void testReadUtf8() {
byte[] dest = new byte[10 * 1024];

int index = 0;
index += RionWriter.writeUtf8(source, index, "Hello");
index += RionWriter.writeUtf8(source, index, "Hellå");
index += RionWriter.writeUtf8(source, index, (String) null);

reader.setSource(source, 0, source.length);
reader.parse();

assertEquals(RionFieldTypes.UTF_8_SHORT, reader.fieldType);
assertEquals(5, reader.fieldLength);
assertEquals(6, reader.fieldLength); //Danish character å requires 2 bytes in UTF-8

int length = reader.readUtf8(dest);
assertEquals(5, length);
assertEquals(6, length);
assertEquals('H', dest[0]);
assertEquals('e', dest[1]);
assertEquals('l', dest[2]);
assertEquals('l', dest[3]);
assertEquals('o', dest[4]);
assertEquals(0xc3, 255 & dest[4]);
assertEquals(0xa5, 255 & dest[5]);

length = reader.readUtf8(dest, 1, 3);
assertEquals(3, length);
assertEquals('H', dest[1]);
assertEquals('e', dest[2]);
assertEquals('l', dest[3]);

assertEquals("Hello", reader.readUtf8String());
assertEquals("Hellå", reader.readUtf8String());

reader.next();
reader.parse();
Expand Down Expand Up @@ -402,7 +403,7 @@ public void testReadKey() {
index += RionWriter.writeKey(source, index, (String) null);

reader.setSource(source, 0, source.length);
reader.parse();
reader.nextParse();

assertEquals(RionFieldTypes.KEY, reader.fieldType);
assertEquals(5, reader.fieldLength);
Expand Down Expand Up @@ -432,6 +433,25 @@ public void testReadKey() {
}


/**
 * Checks that a Key field whose text contains a non-ASCII character is read back
 * as the same Java String by {@code readKeyAsUtf8String()}.
 *
 * <p>The key "Hellå" is expected to occupy 6 bytes, i.e. one more byte than its
 * 5 characters, because of the "å".
 */
@Test
public void readKeyAsUtf8String() {
byte[] source = new byte[10 * 1024];
int index = 0;

// Write one key with a non-ASCII character, followed by a null key.
index += RionWriter.writeKey(source, index, "Hellå");
index += RionWriter.writeKey(source, index, (String) null);

reader.setSource(source, 0, source.length);
reader.nextParse();

assertEquals(RionFieldTypes.KEY, reader.fieldType);
assertEquals(6, reader.fieldLength);

assertEquals("Hellå", reader.readKeyAsUtf8String());
}



@Test
public void testReadKeyShort() {
byte[] source = new byte[10 * 1024];
Expand Down

0 comments on commit 85f0b95

Please sign in to comment.