-
Notifications
You must be signed in to change notification settings - Fork 1.8k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
[Enhancement] Ignore union type tag when converting avro to json (backport #52973) (#53100)
Co-authored-by: wyb <[email protected]>
- Loading branch information
1 parent
6a77576
commit 4362c8d
Showing
4 changed files
with
669 additions
and
40 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -14,6 +14,10 @@ | |
|
||
#include "binary_column.h" | ||
|
||
#include <rapidjson/document.h> | ||
#include <rapidjson/stringbuffer.h> | ||
#include <rapidjson/writer.h> | ||
|
||
#include "column/binary_column.h" | ||
#include "column/json_column.h" | ||
#include "common/status.h" | ||
|
@@ -199,15 +203,238 @@ static Status add_column_with_boolean_value(BinaryColumn* column, const TypeDesc | |
return Status::OK(); | ||
} | ||
|
||
// Copies `size` raw bytes starting at `data` into `out` as a JSON string value.
// An explicit length is used so that buffers that are not NUL-terminated (avro bytes / fixed)
// are never read past their end, and embedded NUL bytes are kept instead of truncating the value.
static Status avro_set_rapidjson_string(const char* data, size_t size, rapidjson::Document::AllocatorType& allocator,
                                        rapidjson::Value& out) {
    const auto length = static_cast<rapidjson::SizeType>(size);
    // rapidjson::SizeType may be narrower than size_t; refuse values that would be silently truncated.
    if (static_cast<size_t>(length) != size) {
        return Status::InvalidArgument("Avro value is too large to be converted to a json string");
    }
    if (data == nullptr || length == 0) {
        // Never hand a null pointer to rapidjson, even for an empty value.
        out.SetString("", 0, allocator);
        return Status::OK();
    }
    out.SetString(data, length, allocator);
    return Status::OK();
}

// Recursively converts `value` into the rapidjson value `out`; every copied string and every
// container node is allocated from `allocator`.
//
// Mapping:
//   string / bytes / fixed -> JSON string (bytes are copied verbatim, no escaping or base64 here)
//   int / long             -> JSON integer
//   float / double         -> JSON number
//   boolean                -> JSON bool
//   null                   -> JSON null
//   enum                   -> JSON string holding the symbol name
//   record / map           -> JSON object
//   array                  -> JSON array
//   union                  -> the conversion of the currently selected branch, without a type tag
//
// Returns InvalidArgument when an avro accessor fails or the avro type is not handled.
static Status avro_value_to_rapidjson(const avro_value_t& value, rapidjson::Document::AllocatorType& allocator,
                                      rapidjson::Value& out) {
    switch (avro_value_get_type(&value)) {
    case AVRO_STRING: {
        const char* in = nullptr;
        size_t size = 0;
        if (avro_value_get_string(&value, &in, &size) != 0) {
            return Status::InvalidArgument(strings::Substitute("Get string value error $0", avro_strerror()));
        }
        // avro-c reports the string length including the trailing NUL terminator; drop it if present.
        if (in != nullptr && size > 0 && in[size - 1] == '\0') {
            --size;
        }
        return avro_set_rapidjson_string(in, size, allocator, out);
    }
    case AVRO_BYTES: {
        const void* in = nullptr;
        size_t size = 0;
        // Bytes values must be read with the bytes accessor; the fixed accessor does not apply to them.
        if (avro_value_get_bytes(&value, &in, &size) != 0) {
            return Status::InvalidArgument(strings::Substitute("Get bytes value error $0", avro_strerror()));
        }
        return avro_set_rapidjson_string(static_cast<const char*>(in), size, allocator, out);
    }
    case AVRO_INT32: {
        int32_t in = 0;
        if (avro_value_get_int(&value, &in) != 0) {
            return Status::InvalidArgument(strings::Substitute("Get int32 value error $0", avro_strerror()));
        }
        out.SetInt(in);
        return Status::OK();
    }
    case AVRO_INT64: {
        int64_t in = 0;
        if (avro_value_get_long(&value, &in) != 0) {
            return Status::InvalidArgument(strings::Substitute("Get int64 value error $0", avro_strerror()));
        }
        out.SetInt64(in);
        return Status::OK();
    }
    case AVRO_FLOAT: {
        float in = 0;
        if (avro_value_get_float(&value, &in) != 0) {
            return Status::InvalidArgument(strings::Substitute("Get float value error $0", avro_strerror()));
        }
        out.SetFloat(in);
        return Status::OK();
    }
    case AVRO_DOUBLE: {
        double in = 0;
        if (avro_value_get_double(&value, &in) != 0) {
            return Status::InvalidArgument(strings::Substitute("Get double value error $0", avro_strerror()));
        }
        out.SetDouble(in);
        return Status::OK();
    }
    case AVRO_BOOLEAN: {
        int in = 0;
        if (avro_value_get_boolean(&value, &in) != 0) {
            return Status::InvalidArgument(strings::Substitute("Get boolean value error $0", avro_strerror()));
        }
        out.SetBool(in != 0);
        return Status::OK();
    }
    case AVRO_NULL: {
        out.SetNull();
        return Status::OK();
    }
    case AVRO_RECORD: {
        size_t field_count = 0;
        if (avro_value_get_size(&value, &field_count) != 0) {
            return Status::InvalidArgument(strings::Substitute("Get record field count error $0", avro_strerror()));
        }

        out.SetObject();
        for (size_t i = 0; i < field_count; ++i) {
            avro_value_t field_value;
            const char* field_name = nullptr;
            if (avro_value_get_by_index(&value, i, &field_value, &field_name) != 0) {
                return Status::InvalidArgument(strings::Substitute("Get record field error $0", avro_strerror()));
            }

            rapidjson::Value field_name_val;
            field_name_val.SetString(field_name, allocator);
            rapidjson::Value field_value_val;
            RETURN_IF_ERROR(avro_value_to_rapidjson(field_value, allocator, field_value_val));
            out.AddMember(field_name_val, field_value_val, allocator);
        }
        return Status::OK();
    }
    case AVRO_ENUM: {
        int symbol_value = 0;
        if (avro_value_get_enum(&value, &symbol_value) != 0) {
            return Status::InvalidArgument(strings::Substitute("Get enum value error $0", avro_strerror()));
        }

        avro_schema_t enum_schema = avro_value_get_schema(&value);
        const char* symbol_name = avro_schema_enum_get(enum_schema, symbol_value);
        // The lookup yields no name for an index outside the schema's symbol list.
        if (symbol_name == nullptr) {
            return Status::InvalidArgument(strings::Substitute("Get enum symbol error, index $0", symbol_value));
        }
        out.SetString(symbol_name, allocator);
        return Status::OK();
    }
    case AVRO_FIXED: {
        const void* in = nullptr;
        size_t size = 0;
        if (avro_value_get_fixed(&value, &in, &size) != 0) {
            return Status::InvalidArgument(strings::Substitute("Get fixed value error $0", avro_strerror()));
        }
        // Fixed data is a raw buffer of exactly `size` bytes, so it must be copied by length.
        return avro_set_rapidjson_string(static_cast<const char*>(in), size, allocator, out);
    }
    case AVRO_MAP: {
        size_t map_size = 0;
        if (avro_value_get_size(&value, &map_size) != 0) {
            return Status::InvalidArgument(strings::Substitute("Get map size error $0", avro_strerror()));
        }

        out.SetObject();
        for (size_t i = 0; i < map_size; ++i) {
            const char* key = nullptr;
            avro_value_t map_value;
            if (avro_value_get_by_index(&value, i, &map_value, &key) != 0) {
                return Status::InvalidArgument(strings::Substitute("Get map key value error $0", avro_strerror()));
            }

            rapidjson::Value key_val;
            key_val.SetString(key, allocator);
            rapidjson::Value value_val;
            RETURN_IF_ERROR(avro_value_to_rapidjson(map_value, allocator, value_val));
            out.AddMember(key_val, value_val, allocator);
        }
        return Status::OK();
    }
    case AVRO_ARRAY: {
        size_t array_size = 0;
        if (avro_value_get_size(&value, &array_size) != 0) {
            return Status::InvalidArgument(strings::Substitute("Get array size error $0", avro_strerror()));
        }

        out.SetArray();
        for (size_t i = 0; i < array_size; ++i) {
            avro_value_t element;
            if (avro_value_get_by_index(&value, i, &element, nullptr) != 0) {
                return Status::InvalidArgument(strings::Substitute("Get array element error $0", avro_strerror()));
            }

            rapidjson::Value element_value;
            RETURN_IF_ERROR(avro_value_to_rapidjson(element, allocator, element_value));
            out.PushBack(element_value, allocator);
        }
        return Status::OK();
    }
    case AVRO_UNION: {
        avro_value_t union_value;
        if (avro_value_get_current_branch(&value, &union_value) != 0) {
            return Status::InvalidArgument(strings::Substitute("Get union value error $0", avro_strerror()));
        }
        // Write the selected branch straight into `out`, which is what removes the union type tag.
        RETURN_IF_ERROR(avro_value_to_rapidjson(union_value, allocator, out));
        return Status::OK();
    }
    default:
        return Status::InvalidArgument("Unsupported avro type");
    }
}
|
||
// Serializes an avro value to JSON text with rapidjson and appends the text to `*json_str`.
// Unlike avro's own `avro_value_to_json`, union values are written without their type tag:
// only the value of the selected branch appears in the output.
//
// Example schema:
// {
//     "type": "record",
//     "name": "User",
//     "fields": [
//         {"name": "id", "type": "int"},
//         {"name": "name", "type": "string"},
//         {"name": "email", "type": ["null",
//             {
//                 "type": "record",
//                 "name": "email2",
//                 "fields": [
//                     {"name": "x", "type": ["null", "int"]},
//                     {"name": "y", "type": ["null", "string"]}
//                 ]
//             }
//         ]}
//     ]
// }
//
// Output of avro `avro_value_to_json`:
// {"id": 1, "name": "Alice", "email": {"email2": {"x": {"int": 1}, "y": {"string": "alice@example.com"}}}}
//
// Output of this function:
// {"id":1,"name":"Alice","email":{"x":1,"y":"alice@example.com"}}
//
// Returns the error of the conversion step unchanged; in that case `*json_str` is not modified.
static Status avro_value_to_json_str(const avro_value_t& value, std::string* json_str) {
    // The document is used only for its allocator, which backs every node of the converted tree.
    rapidjson::Document holder;
    rapidjson::Value tree;
    RETURN_IF_ERROR(avro_value_to_rapidjson(value, holder.GetAllocator(), tree));

    rapidjson::StringBuffer serialized;
    rapidjson::Writer<rapidjson::StringBuffer> emitter(serialized);
    tree.Accept(emitter);

    json_str->append(serialized.GetString(), serialized.GetSize());
    return Status::OK();
}
|
||
// Appends the JSON text of an avro array/object `value` to the binary `column`.
//
// When `config::avro_ignore_union_type_tag` is set, the text comes from `avro_value_to_json_str`
// (union type tags omitted); otherwise it comes from avro's own `avro_value_to_json`.
//
// `name` is only used to identify the column in error messages; `type_desc` is not read here.
// Returns InternalError carrying the column name and the underlying error when conversion fails.
static Status add_column_with_array_object_value(BinaryColumn* column, const TypeDescriptor& type_desc,
                                                 const std::string& name, const avro_value_t& value) {
    if (config::avro_ignore_union_type_tag) {
        std::string json_str;
        auto st = avro_value_to_json_str(value, &json_str);
        if (!st.ok()) {
            return Status::InternalError(
                    strings::Substitute("avro to json failed. column=$0, err=$1", name, st.message()));
        }

        // NOTE(review): json_str is destroyed on return, so append() is expected to copy the bytes.
        column->append(Slice(json_str));
    } else {
        char* as_json = nullptr;
        if (avro_value_to_json(&value, 1, &as_json)) {
            LOG(WARNING) << "avro to json failed: " << avro_strerror();
            return Status::InternalError(
                    strings::Substitute("avro to json failed. column=$0, err=$1", name, avro_strerror()));
        }
        // avro allocates the buffer; release it exactly once on every path out of this scope.
        DeferOp json_deleter([&] { free(as_json); });
        column->append(Slice(as_json));
    }
    return Status::OK();
}
|
||
|
@@ -251,18 +478,34 @@ Status add_binary_column(Column* column, const TypeDescriptor& type_desc, const | |
|
||
// Converts an avro `value` to JSON text, parses it into a JsonValue and appends it to `column`,
// which is down-cast to JsonColumn.
//
// When `config::avro_ignore_union_type_tag` is set, the text comes from `avro_value_to_json_str`
// (union type tags omitted); otherwise it comes from avro's own `avro_value_to_json`.
//
// `name` is only used to identify the column in error messages; `type_desc` is not read here.
// Returns InternalError carrying the column name and the underlying error when either the
// avro-to-json conversion or the json parsing fails.
Status add_native_json_column(Column* column, const TypeDescriptor& type_desc, const std::string& name,
                              const avro_value_t& value) {
    JsonValue json_value;
    Status st;
    if (config::avro_ignore_union_type_tag) {
        std::string json_str;
        st = avro_value_to_json_str(value, &json_str);
        if (!st.ok()) {
            return Status::InternalError(
                    strings::Substitute("avro to json failed. column=$0, err=$1", name, st.message()));
        }

        st = JsonValue::parse(Slice(json_str), &json_value);
    } else {
        char* as_json = nullptr;
        if (avro_value_to_json(&value, 1, &as_json)) {
            LOG(WARNING) << "avro to json failed: " << avro_strerror();
            return Status::InternalError(
                    strings::Substitute("avro to json failed. column=$0, err=$1", name, avro_strerror()));
        }

        // avro allocates the buffer; release it once parsing is done, on every path out of this scope.
        DeferOp json_deleter([&] { free(as_json); });
        st = JsonValue::parse(as_json, &json_value);
    }

    // Both branches leave the parse result in `st`, so it is checked once here.
    if (!st.ok()) {
        return Status::InternalError(strings::Substitute("parse json failed. column=$0, err=$1", name, st.message()));
    }

    auto json_column = down_cast<JsonColumn*>(column);
    json_column->append(std::move(json_value));
    return Status::OK();
}
|
Oops, something went wrong.