Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add text parser. #11

Merged
merged 15 commits into from
Mar 10, 2020
29 changes: 27 additions & 2 deletions src/error.rs
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
use num_bigint::BigInt;
use thiserror::Error;

pub type Result<T> = std::result::Result<T, Error>;
Expand Down Expand Up @@ -67,6 +68,28 @@ pub enum BinaryFormatError {
StructUnordered,
#[error("invalid local symbol table")]
LocalTable,
#[error("time component out of range: {0} - {1}")]
TimeComponentRange(TimeComponent, BigInt),
}

#[derive(Error, Debug, PartialEq)]
pub enum TimeComponent {
#[error("offset")]
Offset,
#[error("year")]
Year,
#[error("month")]
Month,
#[error("day")]
Day,
#[error("hour")]
Hour,
#[error("minute")]
Minute,
#[error("second")]
Second,
#[error("fraction")]
Fraction,
}

#[derive(Error, Debug, PartialEq)]
Expand All @@ -77,8 +100,8 @@ pub enum TextFormatError {
OpenShortString,
#[error("unterminated long quoted string")]
OpenLongString,
#[error("invalid bigint")]
BigUint,
#[error("invalid biguint: {0}")]
BigUint(String),
#[error("invalid bigint: {0}")]
BigInt(String),
#[error("unable to decode Base64 value")]
Expand All @@ -91,4 +114,6 @@ pub enum TextFormatError {
UnsupportedVersion(u32, u32),
#[error("Ion Version Marker could not be parsed (int component too big)")]
IvmParseError,
#[error("Date is too imprecise for time value presence")]
ImpreciseDate,
}
75 changes: 64 additions & 11 deletions src/parser/ion_1_0/binary.rs
Original file line number Diff line number Diff line change
@@ -1,11 +1,13 @@
use super::current_symbol_table::*;
use super::subfield::*;
use super::typed_value::*;
use crate::error::{BinaryFormatError, FormatError};
use crate::parser::ion_1_0::current_symbol_table::CurrentSymbolTable;
use crate::parser::parse_error::{IonError, IonResult};
use crate::symbols::{SymbolToken, SYSTEM_SYMBOL_TABLE_V1};
use crate::value::{Blob, Clob, Data, Decimal, List, Sexp, Struct, Timestamp, Value};
use super::{current_symbol_table::*, subfield::*, typed_value::*};
use crate::{
error::{BinaryFormatError, FormatError, TimeComponent},
parser::{
ion_1_0::current_symbol_table::CurrentSymbolTable,
parse_error::{IonError, IonResult},
},
symbols::{SymbolToken, SYSTEM_SYMBOL_TABLE_V1},
value::{Blob, Clob, Data, Decimal, List, Sexp, Struct, Timestamp, Value},
};
use itertools::Itertools;
use nom::{
combinator::{all_consuming, complete},
Expand All @@ -15,9 +17,8 @@ use nom::{
sequence::pair,
Err,
};
use num_bigint::{BigInt, BigUint, Sign};
use num_traits::identities::Zero;
use num_traits::ToPrimitive;
use num_bigint::{BigInt, BigUint, Sign, ToBigInt};
use num_traits::{identities::Zero, ToPrimitive};

type ParseResult<I, T> = Result<T, Err<IonError<I>>>;

Expand Down Expand Up @@ -422,18 +423,42 @@ fn parse_decimal(typed_value: TypedValue) -> ParseResult<&[u8], Option<Decimal>>
/// text encoding are in the local time! This means that transcoding requires a conversion between
/// UTC and local time.
fn parse_timestamp(typed_value: TypedValue) -> ParseResult<&[u8], Option<Timestamp>> {
/// Builds the unrecoverable parse failure reported when a timestamp component does not fit
/// its target integer type.
///
/// `i` is the input position attached to the error, `component` names the field that was out
/// of range, and `value` is the offending number as it was read.
fn time_error(i: &[u8], component: TimeComponent, value: BigInt) -> nom::Err<IonError<&[u8]>> {
    let out_of_range = BinaryFormatError::TimeComponentRange(component, value);
    let ion_error = IonError::from_format_error(i, FormatError::Binary(out_of_range));
    Err::Failure(ion_error)
}

match typed_value.length_code {
LengthCode::L15 => Ok(None),
_ => {
let (rest, offset) = take_var_int(typed_value.rep)?;
let offset = offset
.to_i32()
.ok_or_else(|| time_error(typed_value.index, TimeComponent::Offset, offset))?;
siler marked this conversation as resolved.
Show resolved Hide resolved
let (rest, year) = take_var_uint(rest)?;
let year = year.to_u16().ok_or_else(|| {
time_error(
typed_value.index,
TimeComponent::Year,
year.to_bigint().unwrap(),
)
})?;

// Parsing complete with precision of Year
if rest.is_empty() {
return Ok(Some(Timestamp::Year { offset, year }));
}

let (rest, month) = take_var_uint(rest)?;
let month = month.to_u8().ok_or_else(|| {
time_error(
typed_value.index,
TimeComponent::Month,
month.to_bigint().unwrap(),
)
})?;

// Parsing complete with precision of Month
if rest.is_empty() {
Expand All @@ -445,6 +470,13 @@ fn parse_timestamp(typed_value: TypedValue) -> ParseResult<&[u8], Option<Timesta
}

let (rest, day) = take_var_uint(rest)?;
let day = day.to_u8().ok_or_else(|| {
time_error(
typed_value.index,
TimeComponent::Day,
day.to_bigint().unwrap(),
)
})?;

// Parsing complete with precision of Day
if rest.is_empty() {
Expand All @@ -457,7 +489,21 @@ fn parse_timestamp(typed_value: TypedValue) -> ParseResult<&[u8], Option<Timesta
}

let (rest, hour) = take_var_uint(rest)?;
let hour = hour.to_u8().ok_or_else(|| {
time_error(
typed_value.index,
TimeComponent::Hour,
hour.to_bigint().unwrap(),
)
})?;
let (rest, minute) = take_var_uint(rest)?;
let minute = minute.to_u8().ok_or_else(|| {
time_error(
typed_value.index,
TimeComponent::Minute,
minute.to_bigint().unwrap(),
)
})?;

// Parsing complete with precision of Minute
if rest.is_empty() {
Expand All @@ -472,6 +518,13 @@ fn parse_timestamp(typed_value: TypedValue) -> ParseResult<&[u8], Option<Timesta
}

let (rest, second) = take_var_uint(rest)?;
let second = second.to_u8().ok_or_else(|| {
time_error(
typed_value.index,
TimeComponent::Second,
second.to_bigint().unwrap(),
)
})?;

// Parsing complete with precision of Second
if rest.is_empty() {
Expand Down
99 changes: 55 additions & 44 deletions src/parser/ion_1_0/text/mod.rs
Original file line number Diff line number Diff line change
@@ -1,7 +1,9 @@
#![warn(dead_code, unused_variables)]
#[cfg(test)]
mod tests;
mod time;

use self::time::{TextDate, TextTime, TextTimestamp};
use crate::{
error::{FormatError, SymbolError, TextFormatError},
parser::{
Expand All @@ -12,6 +14,7 @@ use crate::{
symbols::SymbolToken,
value::{self as ion},
};
use ::time::UtcOffset;
use log::warn;
use nom::{
self,
Expand All @@ -30,7 +33,7 @@ use nom::{
AsBytes, AsChar, Compare, Err, ExtendInto, InputIter, InputLength, InputTake,
InputTakeAtPosition, Offset, Slice,
};
use num_bigint::{BigInt, Sign};
use num_bigint::{BigInt, BigUint, Sign};
use num_traits::{pow, Num, One, Zero};
use std::{
cell::RefCell,
Expand All @@ -42,7 +45,6 @@ use std::{
rc::Rc,
str::{self, from_utf8},
};
use time::UtcOffset;

/// Follows the following documents:
/// Ion Text Encoding: http://amzn.github.io/ion-docs/docs/text.html
Expand Down Expand Up @@ -1226,18 +1228,17 @@ fn take_bool(i: &str) -> IonResult<&str, bool> {
/// | YEAR 'T'
/// ;
fn take_timestamp(i: &str) -> IonResult<&str, ion::Timestamp> {
let (i, timestamp) = map(
let (i, timestamp) = map_res(
alt((
map_res(
pair(take_date, opt(preceded(one_of("tT"), opt(take_time)))),
|((year, month, day), maybe_time)| match ion::Date::day(year, month, day) {
Ok(date) => {
if let Some(time) = maybe_time {
Ok(ion::TextTimestamp::new(date, time))
} else {
Ok(ion::TextTimestamp::new(date, None))
|((year, month, day), maybe_time)| match TextDate::day(year as u16, month, day) {
Ok(date) => match maybe_time {
Some(Some((time, offset))) => {
Ok(TextTimestamp::new(date, Some(time), offset))
}
}
_ => Ok(time::TextTimestamp::new(date, None, UtcOffset::UTC)),
},
Err(e) => Err(e),
},
),
Expand All @@ -1247,13 +1248,15 @@ fn take_timestamp(i: &str) -> IonResult<&str, ion::Timestamp> {
separated_pair(take_year, char('-'), take_month),
one_of("tT"),
),
|(year, month)| ion::TextTimestamp::new(ion::Date::Month { year, month }, None),
|(year, month)| {
TextTimestamp::new(TextDate::month(year as u16, month), None, UtcOffset::UTC)
},
),
map(terminated(take_year, one_of("tT")), |year| {
ion::TextTimestamp::new(ion::Date::Year { year }, None)
TextTimestamp::new(TextDate::year(year as u16), None, UtcOffset::UTC)
}),
)),
ion::Timestamp::Text,
ion::Timestamp::try_from,
)(i)?;

Ok((i, timestamp))
Expand Down Expand Up @@ -1350,34 +1353,28 @@ fn take_hour_and_minute(i: &str) -> IonResult<&str, (u8, u8)> {
separated_pair(take_hour, char(COLON), take_minute)(i)
}

fn assemble_time_hm(hour: u8, minute: u8, offset: UtcOffset) -> ion::Time {
ion::Time::Minute {
hour,
minute,
offset,
}
fn assemble_time_hm(hour: u8, minute: u8) -> TextTime {
TextTime::Minute { hour, minute }
}

fn assemble_time_hms(
hour: u8,
minute: u8,
second: u8,
maybe_fractional: Option<BigInt>,
offset: UtcOffset,
) -> ion::Time {
match maybe_fractional {
Some(fractional) => ion::Time::FractionalSecond {
maybe_fraction: Option<(BigUint, i32)>,
) -> TextTime {
match maybe_fraction {
Some((fraction_coefficient, fraction_exponent)) => TextTime::FractionalSecond {
hour,
minute,
second,
fractional,
offset,
fraction_coefficient,
fraction_exponent,
},
None => ion::Time::Second {
None => TextTime::Second {
hour,
minute,
second,
offset,
},
}
}
Expand All @@ -1386,19 +1383,19 @@ fn assemble_time_hms(
/// TIME
/// : HOUR ':' MINUTE (':' SECOND)? OFFSET
/// ;
fn take_time(i: &str) -> IonResult<&str, ion::Time> {
fn take_time(i: &str) -> IonResult<&str, (TextTime, UtcOffset)> {
let (i, ((hour, minute), second, offset)) = tuple((
take_hour_and_minute,
opt(preceded(char(COLON), take_second)),
take_offset,
))(i)?;

let time = match second {
Some((second, fractional)) => assemble_time_hms(hour, minute, second, fractional, offset),
None => assemble_time_hm(hour, minute, offset),
Some((second, fraction)) => assemble_time_hms(hour, minute, second, fraction),
None => assemble_time_hm(hour, minute),
};

Ok((i, time))
Ok((i, (time, offset)))
}

/// fragment
Expand Down Expand Up @@ -1452,21 +1449,27 @@ fn take_minute(i: &str) -> IonResult<&str, u8> {
)(i)
}

/// Fractional part of a seconds field as a `(coefficient, exponent)` pair.
///
/// As produced by `take_second`: the coefficient is the run of digits after the '.' parsed as a
/// base-10 `BigUint`, and the exponent is the negated number of those digits.
type FractionalSecond = (BigUint, i32);

/// note that W3C spec requires a digit after the '.'
/// fragment
/// SECOND
/// : [0-5] DEC_DIGIT ('.' DEC_DIGIT+)?
/// ;
fn take_second(i: &str) -> IonResult<&str, (u8, Option<BigInt>)> {
let (i, s) = recognize(pair(one_of("012345"), one_if(is_dec_digit)))(i)?;
let (i, f) = opt(preceded(char('.'), take_while1(is_dec_digit)))(i)?;
let seconds = s
fn take_second(i: &str) -> IonResult<&str, (u8, Option<FractionalSecond>)> {
let (i, seconds) = recognize(pair(one_of("012345"), one_if(is_dec_digit)))(i)?;
let (i, seconds_decimal) = opt(preceded(char('.'), take_while1(is_dec_digit)))(i)?;
let seconds = seconds
.parse::<u8>()
.expect("parser verified seconds should be valid u8");
if let Some(f) = f {
let fractional =
str_to_bigint(f, 10).map_err(|e| Err::Failure(IonError::from_format_error(i, e)))?;
Ok((i, (seconds, Some(fractional))))
if let Some(decimal) = seconds_decimal {
let fraction_exponent = -(decimal.len() as i32);
let fraction_coefficient = str_to_biguint(decimal, 10)
.map_err(|e| Err::Failure(IonError::from_format_error(i, e)))?;
Ok((
i,
(seconds, Some((fraction_coefficient, fraction_exponent))),
))
} else {
Ok((i, (seconds, None)))
}
Expand All @@ -1476,9 +1479,17 @@ fn take_second(i: &str) -> IonResult<&str, (u8, Option<BigInt>)> {
/// Encoding Section: Ion Int
///

/// Helper for turning Vec<&str>s into BigInts. Or failing miserably.
/// TODO: this should not pretend to be a parser, it should be mapped over parse results
/// see take_keyword_entity for an example
/// Parses a string-like value as an unsigned big integer in the given radix.
///
/// Returns `TextFormatError::BigUint` (wrapped in `FormatError::Text`) carrying a copy of the
/// rejected input when the digits are not valid for `radix`.
fn str_to_biguint<T: AsRef<str>>(digits: T, radix: u32) -> Result<BigUint, FormatError> {
    let text = digits.as_ref();
    BigUint::from_str_radix(text, radix)
        .map_err(|_| FormatError::Text(TextFormatError::BigUint(text.to_string())))
}

/// Helper for turning &str-ish values into BigInts.
fn str_to_bigint<T: AsRef<str>>(digits: T, radix: u32) -> Result<BigInt, FormatError> {
match BigInt::from_str_radix(digits.as_ref(), radix) {
Ok(bigint) => Ok(bigint),
Expand All @@ -1488,7 +1499,7 @@ fn str_to_bigint<T: AsRef<str>>(digits: T, radix: u32) -> Result<BigInt, FormatE
}
}

/// Helper for turning Vec<&str>s into BigUints. Or failing miserably.
/// Helper for turning Vec<&str>s into BigInts.
fn str_vec_to_bigint(vec: Vec<&str>, radix: u32) -> Result<BigInt, FormatError> {
let digits: String = vec.concat();
Ok(str_to_bigint(digits, radix)?)
Expand Down
Loading