Skip to content

Commit fe73d38

Browse files
committed
metadata: Compact integer encoding.
Previously, every auto-serialized tag was strongly typed. However, this is not strictly required, and the freedom can instead be exploited to provide an optimal encoding for smaller integers. This commit repurposes the `EsI8`/`EsU8` through `EsI64`/`EsU64` tags to represent *any* integer within the given range: for example, it is now possible to encode `42u64` as the two bytes `EsU8 0x2a`. There are some limitations: * It does not apply to non-auto-serialized tags, for obvious reasons. Fortunately, we have already eliminated the biggest source of such tags, `tag_table_id`, in favor of auto-serialized tags. * Bigger tags cannot be used to represent smaller types. * Signed tags and unsigned tags do not mix.
1 parent 36a09a1 commit fe73d38

File tree

1 file changed

+114
-93
lines changed

1 file changed

+114
-93
lines changed

src/librbml/lib.rs

+114-93
Original file line numberDiff line numberDiff line change
@@ -87,39 +87,37 @@ pub enum EbmlEncoderTag {
8787
// tags 00..1f are reserved for auto-serialization.
8888
// first NUM_IMPLICIT_TAGS tags are implicitly sized and lengths are not encoded.
8989

90-
EsUint = 0x00, // + 8 bytes
91-
EsU64 = 0x01, // + 8 bytes
92-
EsU32 = 0x02, // + 4 bytes
93-
EsU16 = 0x03, // + 2 bytes
94-
EsU8 = 0x04, // + 1 byte
95-
EsInt = 0x05, // + 8 bytes
96-
EsI64 = 0x06, // + 8 bytes
97-
EsI32 = 0x07, // + 4 bytes
98-
EsI16 = 0x08, // + 2 bytes
99-
EsI8 = 0x09, // + 1 byte
100-
EsBool = 0x0a, // + 1 byte
101-
EsChar = 0x0b, // + 4 bytes
102-
EsF64 = 0x0c, // + 8 bytes
103-
EsF32 = 0x0d, // + 4 bytes
104-
EsSub8 = 0x0e, // + 1 byte
105-
EsSub32 = 0x0f, // + 4 bytes
106-
107-
EsStr = 0x10,
108-
EsEnum = 0x11, // encodes the variant id as the first EsSub*
109-
EsVec = 0x12, // encodes the # of elements as the first EsSub*
110-
EsVecElt = 0x13,
111-
EsMap = 0x14, // encodes the # of pairs as the first EsSub*
112-
EsMapKey = 0x15,
113-
EsMapVal = 0x16,
114-
EsOpaque = 0x17,
90+
EsU64 = 0x00, // + 8 bytes
91+
EsU32 = 0x01, // + 4 bytes
92+
EsU16 = 0x02, // + 2 bytes
93+
EsU8 = 0x03, // + 1 byte
94+
EsI64 = 0x04, // + 8 bytes
95+
EsI32 = 0x05, // + 4 bytes
96+
EsI16 = 0x06, // + 2 bytes
97+
EsI8 = 0x07, // + 1 byte
98+
EsBool = 0x08, // + 1 byte
99+
EsChar = 0x09, // + 4 bytes
100+
EsF64 = 0x0a, // + 8 bytes
101+
EsF32 = 0x0b, // + 4 bytes
102+
EsSub8 = 0x0c, // + 1 byte
103+
EsSub32 = 0x0d, // + 4 bytes
104+
105+
EsStr = 0x0e,
106+
EsEnum = 0x0f, // encodes the variant id as the first EsSub*
107+
EsVec = 0x10, // encodes the # of elements as the first EsSub*
108+
EsVecElt = 0x11,
109+
EsMap = 0x12, // encodes the # of pairs as the first EsSub*
110+
EsMapKey = 0x13,
111+
EsMapVal = 0x14,
112+
EsOpaque = 0x15,
115113
}
116114

117115
const NUM_TAGS: uint = 0x1000;
118-
const NUM_IMPLICIT_TAGS: uint = 0x10;
116+
const NUM_IMPLICIT_TAGS: uint = 0x0e;
119117

120118
static TAG_IMPLICIT_LEN: [i8; NUM_IMPLICIT_TAGS] = [
121-
8, 8, 4, 2, 1, // EsU*
122-
8, 8, 4, 2, 1, // EsI*
119+
8, 4, 2, 1, // EsU*
120+
8, 4, 2, 1, // EsI*
123121
1, // EsBool
124122
4, // EsChar
125123
8, 4, // EsF*
@@ -154,9 +152,9 @@ pub mod reader {
154152
use serialize;
155153

156154
use super::{ ApplicationError, EsVec, EsMap, EsEnum, EsSub8, EsSub32,
157-
EsVecElt, EsMapKey, EsU64, EsU32, EsU16, EsU8, EsInt, EsI64,
155+
EsVecElt, EsMapKey, EsU64, EsU32, EsU16, EsU8, EsI64,
158156
EsI32, EsI16, EsI8, EsBool, EsF64, EsF32, EsChar, EsStr, EsMapVal,
159-
EsUint, EsOpaque, EbmlEncoderTag, Doc, TaggedDoc,
157+
EsOpaque, EbmlEncoderTag, Doc, TaggedDoc,
160158
Error, IntTooBig, InvalidTag, Expected, NUM_IMPLICIT_TAGS, TAG_IMPLICIT_LEN };
161159

162160
pub type DecodeResult<T> = Result<T, Error>;
@@ -420,37 +418,6 @@ pub mod reader {
420418
Ok(r_doc)
421419
}
422420

423-
fn next_doc2(&mut self,
424-
exp_tag1: EbmlEncoderTag,
425-
exp_tag2: EbmlEncoderTag) -> DecodeResult<(bool, Doc<'doc>)> {
426-
assert!((exp_tag1 as uint) != (exp_tag2 as uint));
427-
debug!(". next_doc2(exp_tag1={:?}, exp_tag2={:?})", exp_tag1, exp_tag2);
428-
if self.pos >= self.parent.end {
429-
return Err(Expected(format!("no more documents in \
430-
current node!")));
431-
}
432-
let TaggedDoc { tag: r_tag, doc: r_doc } =
433-
try!(doc_at(self.parent.data, self.pos));
434-
debug!("self.parent={:?}-{:?} self.pos={:?} r_tag={:?} r_doc={:?}-{:?}",
435-
self.parent.start,
436-
self.parent.end,
437-
self.pos,
438-
r_tag,
439-
r_doc.start,
440-
r_doc.end);
441-
if r_tag != (exp_tag1 as uint) && r_tag != (exp_tag2 as uint) {
442-
return Err(Expected(format!("expected EBML doc with tag {:?} or {:?} but \
443-
found tag {:?}", exp_tag1, exp_tag2, r_tag)));
444-
}
445-
if r_doc.end > self.parent.end {
446-
return Err(Expected(format!("invalid EBML, child extends to \
447-
{:#x}, parent to {:#x}",
448-
r_doc.end, self.parent.end)));
449-
}
450-
self.pos = r_doc.end;
451-
Ok((r_tag == (exp_tag2 as uint), r_doc))
452-
}
453-
454421
fn push_doc<T, F>(&mut self, exp_tag: EbmlEncoderTag, f: F) -> DecodeResult<T> where
455422
F: FnOnce(&mut Decoder<'doc>) -> DecodeResult<T>,
456423
{
@@ -471,16 +438,59 @@ pub mod reader {
471438
return Ok(0);
472439
}
473440

474-
let (big, doc) = try!(self.next_doc2(EsSub8, EsSub32));
475-
let r = if big {
476-
doc_as_u32(doc) as uint
441+
let TaggedDoc { tag: r_tag, doc: r_doc } =
442+
try!(doc_at(self.parent.data, self.pos));
443+
let r = if r_tag == (EsSub8 as uint) {
444+
doc_as_u8(r_doc) as uint
445+
} else if r_tag == (EsSub32 as uint) {
446+
doc_as_u32(r_doc) as uint
477447
} else {
478-
doc_as_u8(doc) as uint
448+
return Err(Expected(format!("expected EBML doc with tag {:?} or {:?} but \
449+
found tag {:?}", EsSub8, EsSub32, r_tag)));
479450
};
451+
if r_doc.end > self.parent.end {
452+
return Err(Expected(format!("invalid EBML, child extends to \
453+
{:#x}, parent to {:#x}",
454+
r_doc.end, self.parent.end)));
455+
}
456+
self.pos = r_doc.end;
480457
debug!("_next_sub result={:?}", r);
481458
Ok(r)
482459
}
483460

461+
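// variable-length integer stored under any tag from first_tag (widest allowed) through last_tag (1 byte), zero-extended to u64. NOTE(review): signed readers cast this result as-is, so a negative value stored under a narrower tag does not appear to be sign-extended -- confirm.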
462+
fn _next_int(&mut self,
463+
first_tag: EbmlEncoderTag,
464+
last_tag: EbmlEncoderTag) -> DecodeResult<u64> {
465+
if self.pos >= self.parent.end {
466+
return Err(Expected(format!("no more documents in \
467+
current node!")));
468+
}
469+
470+
let TaggedDoc { tag: r_tag, doc: r_doc } =
471+
try!(doc_at(self.parent.data, self.pos));
472+
let r = if first_tag as uint <= r_tag && r_tag <= last_tag as uint {
473+
match last_tag as uint - r_tag {
474+
0 => doc_as_u8(r_doc) as u64,
475+
1 => doc_as_u16(r_doc) as u64,
476+
2 => doc_as_u32(r_doc) as u64,
477+
3 => doc_as_u64(r_doc) as u64,
478+
_ => unreachable!(),
479+
}
480+
} else {
481+
return Err(Expected(format!("expected EBML doc with tag {:?} through {:?} but \
482+
found tag {:?}", first_tag, last_tag, r_tag)));
483+
};
484+
if r_doc.end > self.parent.end {
485+
return Err(Expected(format!("invalid EBML, child extends to \
486+
{:#x}, parent to {:#x}",
487+
r_doc.end, self.parent.end)));
488+
}
489+
self.pos = r_doc.end;
490+
debug!("_next_int({:?}, {:?}) result={:?}", first_tag, last_tag, r);
491+
Ok(r)
492+
}
493+
484494
pub fn read_opaque<R, F>(&mut self, op: F) -> DecodeResult<R> where
485495
F: FnOnce(&mut Decoder, Doc) -> DecodeResult<R>,
486496
{
@@ -502,33 +512,25 @@ pub mod reader {
502512
type Error = Error;
503513
fn read_nil(&mut self) -> DecodeResult<()> { Ok(()) }
504514

505-
fn read_u64(&mut self) -> DecodeResult<u64> { Ok(doc_as_u64(try!(self.next_doc(EsU64)))) }
506-
fn read_u32(&mut self) -> DecodeResult<u32> { Ok(doc_as_u32(try!(self.next_doc(EsU32)))) }
507-
fn read_u16(&mut self) -> DecodeResult<u16> { Ok(doc_as_u16(try!(self.next_doc(EsU16)))) }
508-
fn read_u8 (&mut self) -> DecodeResult<u8 > { Ok(doc_as_u8 (try!(self.next_doc(EsU8 )))) }
515+
fn read_u64(&mut self) -> DecodeResult<u64> { self._next_int(EsU64, EsU8) }
516+
fn read_u32(&mut self) -> DecodeResult<u32> { Ok(try!(self._next_int(EsU32, EsU8)) as u32) }
517+
fn read_u16(&mut self) -> DecodeResult<u16> { Ok(try!(self._next_int(EsU16, EsU8)) as u16) }
518+
fn read_u8(&mut self) -> DecodeResult<u8> { Ok(doc_as_u8(try!(self.next_doc(EsU8)))) }
509519
fn read_uint(&mut self) -> DecodeResult<uint> {
510-
let v = doc_as_u64(try!(self.next_doc(EsUint)));
520+
let v = try!(self._next_int(EsU64, EsU8));
511521
if v > (::std::usize::MAX as u64) {
512522
Err(IntTooBig(v as uint))
513523
} else {
514524
Ok(v as uint)
515525
}
516526
}
517527

518-
fn read_i64(&mut self) -> DecodeResult<i64> {
519-
Ok(doc_as_u64(try!(self.next_doc(EsI64))) as i64)
520-
}
521-
fn read_i32(&mut self) -> DecodeResult<i32> {
522-
Ok(doc_as_u32(try!(self.next_doc(EsI32))) as i32)
523-
}
524-
fn read_i16(&mut self) -> DecodeResult<i16> {
525-
Ok(doc_as_u16(try!(self.next_doc(EsI16))) as i16)
526-
}
527-
fn read_i8 (&mut self) -> DecodeResult<i8> {
528-
Ok(doc_as_u8(try!(self.next_doc(EsI8 ))) as i8)
529-
}
528+
fn read_i64(&mut self) -> DecodeResult<i64> { Ok(try!(self._next_int(EsI64, EsI8)) as i64) }
529+
fn read_i32(&mut self) -> DecodeResult<i32> { Ok(try!(self._next_int(EsI32, EsI8)) as i32) }
530+
fn read_i16(&mut self) -> DecodeResult<i16> { Ok(try!(self._next_int(EsI16, EsI8)) as i16) }
531+
fn read_i8(&mut self) -> DecodeResult<i8> { Ok(doc_as_u8(try!(self.next_doc(EsI8))) as i8) }
530532
fn read_int(&mut self) -> DecodeResult<int> {
531-
let v = doc_as_u64(try!(self.next_doc(EsInt))) as i64;
533+
let v = try!(self._next_int(EsI64, EsI8)) as i64;
532534
if v > (isize::MAX as i64) || v < (isize::MIN as i64) {
533535
debug!("FIXME \\#6122: Removing this makes this function miscompile");
534536
Err(IntTooBig(v as uint))
@@ -739,10 +741,11 @@ pub mod writer {
739741
use std::old_io::{Writer, Seek};
740742
use std::old_io;
741743
use std::slice::bytes;
744+
use std::num::ToPrimitive;
742745

743746
use super::{ EsVec, EsMap, EsEnum, EsSub8, EsSub32, EsVecElt, EsMapKey,
744-
EsU64, EsU32, EsU16, EsU8, EsInt, EsI64, EsI32, EsI16, EsI8,
745-
EsBool, EsF64, EsF32, EsChar, EsStr, EsMapVal, EsUint,
747+
EsU64, EsU32, EsU16, EsU8, EsI64, EsI32, EsI16, EsI8,
748+
EsBool, EsF64, EsF32, EsChar, EsStr, EsMapVal,
746749
EsOpaque, NUM_IMPLICIT_TAGS, NUM_TAGS };
747750
use super::io::SeekableMemWriter;
748751

@@ -1010,32 +1013,50 @@ pub mod writer {
10101013
}
10111014

10121015
fn emit_uint(&mut self, v: uint) -> EncodeResult {
1013-
self.wr_tagged_raw_u64(EsUint as uint, v as u64)
1016+
self.emit_u64(v as u64)
10141017
}
10151018
fn emit_u64(&mut self, v: u64) -> EncodeResult {
1016-
self.wr_tagged_raw_u64(EsU64 as uint, v)
1019+
match v.to_u32() {
1020+
Some(v) => self.emit_u32(v),
1021+
None => self.wr_tagged_raw_u64(EsU64 as uint, v)
1022+
}
10171023
}
10181024
fn emit_u32(&mut self, v: u32) -> EncodeResult {
1019-
self.wr_tagged_raw_u32(EsU32 as uint, v)
1025+
match v.to_u16() {
1026+
Some(v) => self.emit_u16(v),
1027+
None => self.wr_tagged_raw_u32(EsU32 as uint, v)
1028+
}
10201029
}
10211030
fn emit_u16(&mut self, v: u16) -> EncodeResult {
1022-
self.wr_tagged_raw_u16(EsU16 as uint, v)
1031+
match v.to_u8() {
1032+
Some(v) => self.emit_u8(v),
1033+
None => self.wr_tagged_raw_u16(EsU16 as uint, v)
1034+
}
10231035
}
10241036
fn emit_u8(&mut self, v: u8) -> EncodeResult {
10251037
self.wr_tagged_raw_u8(EsU8 as uint, v)
10261038
}
10271039

10281040
fn emit_int(&mut self, v: int) -> EncodeResult {
1029-
self.wr_tagged_raw_i64(EsInt as uint, v as i64)
1041+
self.emit_i64(v as i64)
10301042
}
10311043
fn emit_i64(&mut self, v: i64) -> EncodeResult {
1032-
self.wr_tagged_raw_i64(EsI64 as uint, v)
1044+
match v.to_i32() {
1045+
Some(v) => self.emit_i32(v),
1046+
None => self.wr_tagged_raw_i64(EsI64 as uint, v)
1047+
}
10331048
}
10341049
fn emit_i32(&mut self, v: i32) -> EncodeResult {
1035-
self.wr_tagged_raw_i32(EsI32 as uint, v)
1050+
match v.to_i16() {
1051+
Some(v) => self.emit_i16(v),
1052+
None => self.wr_tagged_raw_i32(EsI32 as uint, v)
1053+
}
10361054
}
10371055
fn emit_i16(&mut self, v: i16) -> EncodeResult {
1038-
self.wr_tagged_raw_i16(EsI16 as uint, v)
1056+
match v.to_i8() {
1057+
Some(v) => self.emit_i8(v),
1058+
None => self.wr_tagged_raw_i16(EsI16 as uint, v)
1059+
}
10391060
}
10401061
fn emit_i8(&mut self, v: i8) -> EncodeResult {
10411062
self.wr_tagged_raw_i8(EsI8 as uint, v)

0 commit comments

Comments
 (0)