From b8f485164e48062af70c30801f97e344051d4614 Mon Sep 17 00:00:00 2001
From: chenglongliu123 <100404670+chenglongliu123@users.noreply.github.com>
Date: Wed, 13 Sep 2023 15:37:17 +0800
Subject: [PATCH 1/2] [doc] change_feature_pdf_to_md (#415)
* change_feature_pdf_to_md
---
docs/source/feature/fg_docs/ComboFeature.md | 33 +++
docs/source/feature/fg_docs/IdFeature.md | 32 +++
docs/source/feature/fg_docs/LookupFeature.md | 112 ++++++++++
docs/source/feature/fg_docs/MatchFeature.md | 100 +++++++++
docs/source/feature/fg_docs/OverLapFeature.md | 56 +++++
docs/source/feature/fg_docs/RawFeature.md | 58 +++++
.../source/feature/fg_docs/SequenceFeature.md | 199 ++++++++++++++++++
docs/source/feature/fg_docs/mutiValues.md | 25 +++
docs/source/feature/rtp_fg.md | 44 ++--
9 files changed, 637 insertions(+), 22 deletions(-)
create mode 100644 docs/source/feature/fg_docs/ComboFeature.md
create mode 100644 docs/source/feature/fg_docs/IdFeature.md
create mode 100644 docs/source/feature/fg_docs/LookupFeature.md
create mode 100644 docs/source/feature/fg_docs/MatchFeature.md
create mode 100644 docs/source/feature/fg_docs/OverLapFeature.md
create mode 100644 docs/source/feature/fg_docs/RawFeature.md
create mode 100644 docs/source/feature/fg_docs/SequenceFeature.md
create mode 100644 docs/source/feature/fg_docs/mutiValues.md
diff --git a/docs/source/feature/fg_docs/ComboFeature.md b/docs/source/feature/fg_docs/ComboFeature.md
new file mode 100644
index 000000000..5e0495cdd
--- /dev/null
+++ b/docs/source/feature/fg_docs/ComboFeature.md
@@ -0,0 +1,33 @@
+# combo_feature
+
+## 功能介绍
+
+combo_feature是多个字段(或表达式)的组合(即笛卡尔积),id_feature可以看成是一种特殊的combo_feature,即参与交叉字段只有一个的combo_feature。一般来讲,参与交叉的各个字段来自不同的表(比如user特征和item特征进行交叉)。
+
+## 配置方法
+
+```
+{
+ "feature_type" : "combo_feature",
+ "feature_name" : "comb_u_age_item",
+ "expression" : ["user:age_class", "item:item_id"]
+}
+```
+
+## 例子
+
+^\]表示多值分隔符,注意这是一个符号,其ASCII编码是"\\x1D",而不是两个符号
+
+| user:age_class的取值 | item:item_id的取值 | 输出的feature |
+| ----------------- | --------------- | ---------------------------------------------------------------------------------------------------------- |
+| 123 | 45678 | comb_u_age_item_123_45678 |
+| abc, bcd | 45678 | comb_u_age_item_abc_45678, comb_u_age_item_bcd_45678 |
+| abc, bcd | 12345^\]45678 | comb_u_age_item_abc_12345, comb_u_age_item_abc_45678, comb_u_age_item_bcd_12345, comb_u_age_item_bcd_45678 |
+
+输出的feature个数等于
+
+```
+|F1| * |F2| * ... * |Fn|
+```
+
+其中Fn指依赖的第n个字段的值的个数。
diff --git a/docs/source/feature/fg_docs/IdFeature.md b/docs/source/feature/fg_docs/IdFeature.md
new file mode 100644
index 000000000..88a25488c
--- /dev/null
+++ b/docs/source/feature/fg_docs/IdFeature.md
@@ -0,0 +1,32 @@
+# id_feature
+
+## 功能介绍
+
+id_feature是一个sparse feature,是一种最简单的离散特征,只是简单的将某个字段的值与用户配置的feature名字拼接。
+
+## 配置方法
+
+```json
+{
+ "feature_type" : "id_feature",
+ "feature_name" : "item_is_main",
+ "expression" : "item:is_main"
+}
+```
+
+| 字段名 | 含义 |
+| -------------- | ----------------------------------------------------------------------------- |
+| feature_name | 必选项,feature_name会被当做最终输出的feature的前缀 |
+| expression | 必选项,expression描述该feature所依赖的字段来源 |
+| need_prefix | 可选项,true表示会拼上feature_name作为前缀,false表示不拼,默认为true,通常在shared_embedding的场景会用false |
+| invalid_values | 可选项,表示这些values都会被输出成null。list string,例如\[""\],表示将所有的空字符串输出变成null。 |
+
+例子 ( ^\]表示多值分隔符,注意这是一个符号,其ASCII编码是"\\x1D",而不是两个符号)
+
+| 类型 | item:is_main的取值 | 输出的feature |
+| -------- | --------------- | ------------------------------------------- |
+| int64_t | 100 | (item_is_main_100, 1) |
+| double | 5.2 | (item_is_main_5, 1)(小数部分会被截断) |
+| string | abc | (item_is_main_abc, 1) |
+| 多值string | abc^\]bcd | (item_is_main_abc, 1),(item_is_main_bcd, 1) |
+| 多值int | 123^\]456 | (item_is_main_123, 1),(item_is_main_456, 1) |
diff --git a/docs/source/feature/fg_docs/LookupFeature.md b/docs/source/feature/fg_docs/LookupFeature.md
new file mode 100644
index 000000000..be2f0b549
--- /dev/null
+++ b/docs/source/feature/fg_docs/LookupFeature.md
@@ -0,0 +1,112 @@
+# lookup_feature
+
+## 功能介绍
+
+如果离线生成不符合预期,请先使用最新的离线fg包。
+
+lookup_feature 和 match_feature类似,是从一组kv中匹配到自己需要的结果。
+
+lookup_feature 依赖 map 和 key 两个字段,map是一个多值string(MultiString)类型的字段,其中每一个string的样子如"k1:v2";key可以是一个任意类型的字段。生成特征时,先是取出key的值,将其转换成string类型,然后在map字段所持有的kv对中进行匹配,获取最终的特征。
+
+map 和 key 源可以是 item,user,context 的任意组合。在线输入的时候item的多值用多值分隔符char(29)分隔,user和context的多值在tpp访问时用list表示。该特征仅支持json形式的配置方式。
+
+## 配置方法
+
+```json
+{
+ "features" : [
+ {
+ "feature_type" : "lookup_feature",
+ "feature_name" : "item_match_item",
+ "map" : "item:item_attr",
+ "key" : "item:item_value",
+ "needDiscrete" : true
+ }
+ ]
+}
+```
+
+对于上面的配置,假设对于某个 doc:
+
+```
+item_attr : "k1:v1^]k2:v2^]k3:v3"
+```
+
+^\]表示多值分隔符,注意这是一个符号,其ASCII编码是"\\x1D",而不是两个符号。该字符在emacs中的输入方式是C-q C-5, 在vi中的输入方式是C-v C-5。 这里item_attr是个多值string。需要切记,当map用来表征多个kv对时,是个多值string,而不是string!
+
+```
+item_value : "k2"
+```
+
+特征结果为 item_match_item_k2_v2。由于needDiscrete的值为true,所以特征结果为离散化后的结果。
+
+## 其它
+
+match_feature 和 lookup_feature都是匹配类型的特征,即从kv对中匹配到相应的结果。两者的区别是: match_feature的被匹配字段user 必须是qinfo中传入的字段,即一次查询中对所有的doc来说这个字段的值都是一致的。而 lookup_feature 的 key 和 map 没有来源的限制。
+
+## 配置详解
+
+默认情况的配置为 `needDiscrete == true, needWeighting == false, needKey == true, combiner == "sum"`
+
+### 默认输出
+
+### needWeighting == true
+
+```
+feature_name:fg
+map:{{"k1:123", "k2:234", "k3:3"}}
+key:{"k1"}
+结果:feature={"fg_k1", 123}
+```
+
+此时会用 string 部分查 weight 表,然后乘对应 feature value 用于 LR 模型。
+
+### needDiscrete == true
+
+```
+feature_name:fg
+map:{{"k1:123", "k2:234", "k3:3"}}
+key:{"k1"}
+结果:feature={"fg_123"}
+```
+
+### needDiscrete == false
+
+```
+map:{{"k1:123", "k2:234", "k3:3"}}
+key:{"k1"}
+结果:feature={123}
+```
+
+如果存在多个 key 时,可以通过配置 combiner 来组合多个查到的值。可能的配置有 `sum, mean, max, min`。 ps:如果要使用combiner的话需要将needDiscrete设置为false,只有dense类才能做combiner,生成的value会是数值类的
+
+一个配置样例 update on 2021.04.15
+
+```json
+"kv_fields_encode": [
+ {
+ "name": "cnty_dense_features",
+ "dimension": 99,
+ "min_hash_type": 0,
+ "use_sparse": true
+ },
+ {
+ "name": "cross_a_tag",
+ "dimension": 12,
+ "min_hash_type": 0,
+ "use_sparse": true
+ },
+ {
+ "name": "cross_gender",
+ "dimension": 12,
+ "min_hash_type": 0,
+ "use_sparse": true
+ },
+ {
+ "name": "cross_purchasing_power",
+ "dimension": 12,
+ "min_hash_type": 0,
+ "use_sparse": true
+ }
+ ]
+```
diff --git a/docs/source/feature/fg_docs/MatchFeature.md b/docs/source/feature/fg_docs/MatchFeature.md
new file mode 100644
index 000000000..4da869694
--- /dev/null
+++ b/docs/source/feature/fg_docs/MatchFeature.md
@@ -0,0 +1,100 @@
+# match_feature
+
+## 功能介绍
+
+match_feature一般用来做特征之间的匹配关系,要用到user,item和category三个字段的值。
+match_feature支持两种类型,hit和multi hit。
+match_feature本质上是一个两层map的匹配,user字段使用string的方式描述了一个两层map:`|`为第一层map的item之间的分隔符,`^`为第一层map的key与value之间的分隔符;`,`为第二层map的item之间的分隔符,`:`为第二层map的key与value之间的分隔符。例如对于`50011740^50011740:0.2,36806676:0.3,122572685:0.5|50006842^16788:0.1`这样的一个string,转化为二层map就是:
+
+```json
+{
+ "50011740" : {
+ "50011740" : 0.2,
+ "36806676" : 0.3,
+ "122572685" : 0.5
+ },
+ "50006842" : {
+ "16788" : 0.1
+ }
+}
+```
+
+对于hit match 匹配的方式,就是用category的值在第一层map中查找,然后使用item的值在第二层map中查找,最终得到一个结果。 如果不需要使用两层匹配,只需要一层匹配,则可以在map的第一层key中填入ALL, 然后在fg配置的category一项中也填成"ALL"即可。具体见实例一。
+
+## 配置方式
+
+json格式配置文件:
+
+```json
+{
+ "feature_name": "user__l1_ctr_1",
+ "feature_type": "match_feature",
+ "category": "ALL",
+ "needDiscrete": false,
+ "item": "item:category_level1",
+ "user": "user:l1_ctr_1",
+ "matchType": "hit"
+}
+```
+
+needDiscrete:true 时,模型使用 match_feature 输出的特征名,忽略特征值。默认为 true。
+needDiscrete:false 时,模型取 match_feature 输出的特征值,而忽略特征名。
+
+matchType:
+hit:输出命中的feature
+
+xml配置文件:
+
+```xml
+
+
+
+
+```
+
+dependencies:需要做Match 的两个特征
+
+category: 类目的feature 字段。category="ALL"不需要分类目匹配
+
+## Normalizer
+
+match_feature 支持和 raw_feature 一样的 normalizer,具体可见 [raw_feature](./RawFeature.md)。
+
+## 配置详解
+
+### hit
+
+对于下面的配置
+
+```json
+{
+ "feature_name": "brand_hit",
+ "feature_type": "match_feature",
+ "category": "item:auction_root_category",
+ "needDiscrete": true,
+ "item": "item:brand_id",
+ "user": "user:user_brand_tags_hit",
+ "matchType": "hit"
+}
+```
+
+假设各字段的值如下:
+
+| user_brand_tags_hit | `50011740^107287172:0.2,36806676:0.3,122572685:0.5\|50006842^16788816:0.1,10122:0.2,29889:0.3,30068:19` |
+| --------------------- | ------------------------------------------------------------------------------------------------------- |
+| brand_id | 30068 |
+| auction_root_category | 50006842 |
+
+如果 needDiscrete=true,结果为:\
+如果 needDiscrete=false,结果为:\
+如果只需要使用一层匹配,则需要将上面配置里的 category 的值改为 ALL。这种情况,用户也可以考虑使用 lookup_feature。 假设各字段的值如下
+
+| user_brand_tags_hit | ALL^16788816:40,10122:40,29889:20,30068:20 |
+| ------------------- | ------------------------------------------ |
+| brand_id | 30068 |
+
+如果 needDiscrete=true,结果:\ 如果 needDiscrete=false,结果:\
+
+### multihit
+
+允许用户 category 和 item 两个值为 ALL(注意,不是配置的值,是传入的值),进行 wildcard 匹配,可以匹配出多个值。输出结果类似于 hit。
diff --git a/docs/source/feature/fg_docs/OverLapFeature.md b/docs/source/feature/fg_docs/OverLapFeature.md
new file mode 100644
index 000000000..b6396db61
--- /dev/null
+++ b/docs/source/feature/fg_docs/OverLapFeature.md
@@ -0,0 +1,56 @@
+# overlap_feature
+
+## 功能介绍
+
+用来输出一些字符串字词匹配信息的feature
+
+离线推荐使用1.3.56-SNAPSHOT这个版本。 ps: 写fg的时候需要注意维度问题:title的维度要大于或等于query的维度(简单来说就是如果title是user特征,那query也只能是user特征;user特征的batch size为1,商品特征的batch size为商品数)。
+
+| 方式 | 描述 | 备注 |
+| ------------------- | ----------------------------------------------- | ------------------ |
+| common_word | 计算query与title间重复term,并输出为fg_common1_common2 | 重复数不超过query term数 |
+| diff_word | 计算query与title间不重复term,并输出为fg_diff1_diff2 | 不重复数不超过query term数 |
+| query_common_ratio | 计算query与title间重复term数占query中term比例,乘以10取下整 | 取值为\[0,10\] |
+| title_common_ratio | 计算query与title间重复term数占title中term比例,乘以100取下整 | 取值为\[0,100\] |
+| is_contain | 计算query是否全部包含在title中,保持顺序 | 0表示未包含,1表示包含 |
+| is_equal | 计算query是否与title完全相同 | 0表示不完全相同,1表示完全相同 |
+| common_word_divided | 计算query与title间重复term,并输出为fg_common1, fg_common2 | 重复数不超过query term数 |
+| diff_word_divided | 计算query与title间不重复term,并输出为fg_diff1, fg_diff2 | 不重复数不超过query term数 |
+
+## 配置方法
+
+```json
+ {
+ "feature_type" : "overlap_feature",
+ "feature_name" : "is_contain",
+ "query" : "user:attr1",
+ "title" : "item:attr2",
+ "method" : "is_contain",
+ "separator" : " "
+ }
+```
+
+| 字段名 | 含义 |
+| ------------ | -------------------------------------------------------------------------------------- |
+| feature_type | 必选项,描述该feature的类型 |
+| feature_name | 必选项,feature_name会被当做最终输出的feature的前缀 |
+| query | 必选项,query依赖的表, attr1是一个多值string, 多值string的分隔符使用chr(29) |
+| title | 必选项,title依赖的表, attr2是一个多值string |
+| method | 可填common_word, diff_word, query_common_ratio, title_common_ratio, is_contain, is_equal, common_word_divided, diff_word_divided, 对应上表八种方式 |
+| separator | 输出结果中的分隔字符,不填写时默认为\_,用户也可以自行定制,具体见例子 |
+
+## 例子
+
+query为high,high2,fiberglass,abc
+title为high,quality,fiberglass,tube,for,golf,bag
+
+| method | separator | feature |
+| ------------------- | --------- | -------------------------- |
+| common_word | | name_high_fiberglass |
+| diff_word | " " | name high2 abc |
+| query_common_ratio | | name_5 |
+| title_common_ratio | | name_28 |
+| is_contain | | name_0 |
+| is_equal | | name_0 |
+| common_word_divided | | name_high, name_fiberglass |
+| diff_word_divided | | name_high2, name_abc |
diff --git a/docs/source/feature/fg_docs/RawFeature.md b/docs/source/feature/fg_docs/RawFeature.md
new file mode 100644
index 000000000..a7b0b772c
--- /dev/null
+++ b/docs/source/feature/fg_docs/RawFeature.md
@@ -0,0 +1,58 @@
+# raw_feature
+
+## 功能介绍
+
+raw_feature是一种dense的feature,直接引用原始feature的字段值作为feature的value。raw_feature仅支持int、float、double等数值类型,对非数值类型的feature需使用id_feature。
+
+## 配置方法
+
+```json
+{
+ "feature_type" : "raw_feature",
+ "feature_name" : "ctr",
+ "expression" : "item:ctr",
+ "normalizer" : "method=log10"
+}
+```
+
+| 字段名 | 含义 |
+| --------------- | ---------------------------------------------------------------------------------- |
+| feature_name | 必选项,在正常使用时该选项是没用处的,因为实际参与接下来运算的主要是feature value,但是在debug的情况下,可以看到对应feature name的值。 |
+| expression | 必选项,expression描述该feature所依赖的字段来源 |
+| value_dimension | 可选项,默认值为1,表示输出的字段的维度。 |
+| normalizer | 可选项,归一化方法,详见后文 |
+
+## 例子
+
+^\]表示多值分隔符,注意这是一个符号,其ASCII编码是"\\x1D",而不是两个符号
+
+| 类型 | item:ctr的取值 | 输出的feature |
+| ------- | ----------- | ---------------------------------------------- |
+| int64_t | 100 | (ctr, 100) |
+| double | 100.1 | (ctr, 100.1) |
+| 多值int | 123^\]456 | (ctr, (123,456)) (注意,输入字段必须与配置的dimension维度一致) |
+
+## Normalizer
+
+raw_feature 和 match_feature 支持 normalizer,共三种,`minmax,zscore,log10`。配置和计算方法如下:
+
+### log10
+
+```
+配置例子:method=log10,threshold=1e-10,default=-10
+计算公式:x = x > threshold ? log10(x) : default;
+```
+
+### zscore
+
+```
+配置例子:method=zscore,mean=0.0,standard_deviation=10.0
+计算公式:x = (x - mean) / standard_deviation
+```
+
+### minmax
+
+```
+配置例子:method=minmax,min=2.1,max=2.2
+计算公式:x = (x - min) / (max - min)
+```
diff --git a/docs/source/feature/fg_docs/SequenceFeature.md b/docs/source/feature/fg_docs/SequenceFeature.md
new file mode 100644
index 000000000..f6333d2e0
--- /dev/null
+++ b/docs/source/feature/fg_docs/SequenceFeature.md
@@ -0,0 +1,199 @@
+# sequence类feature
+
+## 功能介绍
+
+用户的历史行为也是一个很重要的 feature。历史行为通常是一个序列,例如点击序列、购买序列等,组成这个序列的实体可能是商品本身。
+
+## 配置方法
+
+例如我们需要对用户的点击序列进行 fg,序列长度为 10,每个序列提取 nid、price 和 seq_context 特征。正常 item 维度有一个 feat0 特征。配置如下:
+
+```json
+{
+ "features":[
+ {
+ "feature_type":"raw_feature",
+ "feature_name":"feat0",
+ "expression":"user:feat0"
+ },
+ {
+ "sequence_name":"click",
+ "sequence_column":"click_field",
+ "sequence_length":10,
+ "sequence_delim":";",
+ "attribute_delim":"#",
+ "sequence_table":"item",
+ "sequence_pk":"user:user_behavior_seq",
+ "features":[
+ {
+ "feature_name":"nid",
+ "feature_type":"id_feature",
+ "value_type":"String",
+ "expression":"item:nid"
+ },
+ {
+ "feature_name":"price",
+ "feature_type":"raw_feature",
+ "expression":"item:price"
+ },
+ {
+ "feature_name":"seq_context",
+ "feature_type":"raw_feature",
+ "expression":"user:seq_context"
+ }
+ ]
+ }
+ ]
+}
+```
+
+### 在线 FG
+
+我们支持两种方式获取行为序列,一种如例子所示,我们以 `sequence_pk` 配置的字段为主键,RTP 会帮忙从 item 表中查到序列的对应字段值;另一种用户需要在 `qinfo` 中准备好所有的字段。
+
+#### RTP 取 sequence 字段
+
+第一种情况,`sequence_pk` 的长度应该小于等于 `sequence_length` 。如果 `sequence_pk` 指定的值不足 `sequence_length` 个会补齐到 `sequence_length` 长度,fg 的结果会出默认值(dense 类是 0,sparse 类为空)。
+qinfo 例子:
+
+```json
+ {
+ "user:user_behavior_seq" : ["item_id_1", "item_id_2"]
+ }
+```
+
+#### qinfo 传递 sequence 字段
+
+第二种情况,sequence_feature 也支持所有的序列内容都从 qinfo 中传递。例如这里的`user:seq_context` 数组,它的值分别对应 `click_0` 和 `click_1` 。这种情况下用户可以忽略`sequence_table` 和 `sequence_pk` 。
+qinfo 例子:
+
+```json
+ {
+ "user:feat0" : 1.0,
+ "user:user_behavior_seq" : [0, 1],
+ "user:seq_context" : [2, 3]
+ }
+```
+
+#### context seq使用
+
+```
+{
+ "features": [{
+ "sequence_name": "click",
+ "sequence_column": "click_field",
+ "sequence_length": 30,
+ "sequence_delim": ";",
+ "attribute_delim": "#",
+ "sequence_table": "context_table",
+ "sequence_pk": "context:context_seq_id",
+ "features": [{
+ "feature_name": "cid",
+ "feature_type": "id_feature",
+ "value_type": "String",
+ "expression": "context_table:cid"
+ },
+ {
+ "feature_name": "price",
+ "feature_type": "raw_feature",
+ "expression": "context_table:price"
+ },
+ {
+ "feature_name": "seq_context",
+ "feature_type": "raw_feature",
+ "expression": "context:seq_context"
+ }
+ ]
+ }]
+}
+```
+
+context seq特征与user seq类似,区别是每个context是batch size维度的,user seq是一维的。
+配置如上,context_seq_id为输入的context字段。
+第一类特征:需要查context_table,如price特征,会根据context_seq_id查询context_table中的price,然后做fg。
+第二类特征:不需要context_table,如seq_context特征,会直接取seq_context做fg。
+
+#### item seq使用
+
+增加 `"is_item_seq": true` 配置,如下:
+
+```json
+{
+ "features": [{
+ "sequence_name": "item_pic_seq",
+ "sequence_column": "item__pic_vec_seq",
+ "sequence_table": "pic_table",
+ "sequence_pk": "item:pic_sop_id_list",
+ "attribute_delim": "#",
+ "feature_name": "item_pic_seq",
+ "sequence_length": 10,
+ "is_item_seq": true,
+ "features": [{
+ "normalizer": "method=log10",
+ "feature_type": "id_feature",
+ "shared_name": "pic_pv",
+ "hash_bucket_size": 10,
+ "need_prefix": false,
+ "embedding_dimension": 8,
+ "value_type": "String",
+ "feature_name": "pic_pv",
+ "expression": "pic_table:pv"
+ },
+ {
+ "normalizer": "method=log10",
+ "feature_type": "id_feature",
+ "shared_name": "pic_ipv",
+ "hash_bucket_size": 10,
+ "need_prefix": false,
+ "embedding_dimension": 8,
+ "value_type": "String",
+ "feature_name": "pic_ipv",
+ "expression": "pic_table:ipv"
+ },
+ {
+ "feature_type": "id_feature",
+ "shared_name": "bandit_level",
+ "hash_bucket_size": 100,
+ "need_prefix": false,
+ "embedding_dimension": 4,
+ "value_type": "String",
+ "feature_name": "bandit_level",
+ "expression": "pic_table:bandit_level"
+ },
+ {
+ "feature_type": "id_feature",
+ "shared_name": "is_fake_long",
+ "hash_bucket_size": 100,
+ "need_prefix": false,
+ "embedding_dimension": 4,
+ "value_type": "String",
+ "feature_name": "is_fake_long",
+ "expression": "pic_table:is_fake_long"
+ }
+ ]
+ }]
+}
+```
+
+### 离线 FG
+
+目前使用 sequence_feature 要求使用最新版 feature_generator_java,tensorflow 训练流程要求使用 rtp_fg.parse_genreated_fg。
+离线阶段没有sequence表去查,而是通过`sequence_column` 读取本来应该去表里查的字段。因此,`sequence_column`、`sequence_delim`、`attribute_delim` 这三个字段只有在离线 fg 阶段有用。`sequence_column` 是数据源odps表里所有 sequence 特征输入的字段名,离线fg会根据这个字段里的值生成sequence feature,该字段内容是 kv 格式的。`sequence_delim` 是sequence 中行为之间的分隔符,`attribute_delim` 是实际字段名字和字段值的分隔符。
+sequence_length 是 sequence 的长度,用户需要保证字段内容一定是补齐到这个长度的。以上面的配置为例,用户需要有一个名字叫 click_field 的字段。假设某条record里它的内容是:
+
+```
+1 item__nid:11#item__price:2.0\u001D3.0;item__nid:22#item__price:4.0\u001D5.0
+```
+
+表示 `click_0` 和 `click_1` 中的字段分别是 `item__nid:11 item__price:2.0\u001D3.0` 和`item__nid:22 item__price:4.0\u001D5.0` 。fg 的结果会是:
+
+```
+"click_0_nid", "nid_11"
+"click_0_price", "2.0\u001D3.0"
+"click_0_seq_context", "0"
+"click_1_nid", "nid_22"
+"click_1_price", "4.0\u001D5.0"
+"click_1_seq_context", "0"
+```
+
+`rtp_fg.parse_genreated_fg` 的结果中我们可以获得 `click_0_nid`、`click_0_price`、`click_0_seq_context`、`click_1_nid`、`click_1_price`、`click_1_seq_context`,分别对应 sequence 中两个 item 的结果。
diff --git a/docs/source/feature/fg_docs/mutiValues.md b/docs/source/feature/fg_docs/mutiValues.md
new file mode 100644
index 000000000..65a3b15a9
--- /dev/null
+++ b/docs/source/feature/fg_docs/mutiValues.md
@@ -0,0 +1,25 @@
+# 多值类型及分隔符
+
+## item: 维度
+
+例如 v1^\]v2^\]v3
+
+^\]表示多值分隔符,注意这是一个符号,其ASCII编码是"\\x1D",而不是两个符号。该字符在emacs 中的输入方式是C-q C-5, 在vi中的输入方式是 C-v C-5。
+
+## context: 和 user: 维度
+
+在线请求中,使用 json array 表示多值。
+
+离线 FG 过程中,和 item: 一样使用多值分隔符。
+
+## 注意事项
+
+浮点型的特征,rtp只保证6位精度
+
+## 训练模型时样本的分隔符
+
+生成的训练样本的分隔符为 ^B、^C、^D,ASCII编码分别是"0x02"、"0x03"、"0x04",其中0x04用于多值特征的各个值之间的分隔。
+
+例子如下:
+
+特征一\<0x03>值\<0x02>多值特征\<0x03>值\<0x04>值\<0x04>值\<0x02>
diff --git a/docs/source/feature/rtp_fg.md b/docs/source/feature/rtp_fg.md
index 91b2d2703..7fdc41bec 100644
--- a/docs/source/feature/rtp_fg.md
+++ b/docs/source/feature/rtp_fg.md
@@ -36,7 +36,7 @@
- Feature配置说明:
- - [id_feature](http://easyrec.oss-cn-beijing.aliyuncs.com/fg_docs/IdFeature.pdf)
+ - [id_feature](./fg_docs/IdFeature.md)
- is_multi: id_feature是否是多值属性
@@ -46,7 +46,7 @@
- 多值分隔符使用chr(29)\[ctrl+v ctrl+\], 即"\\u001D".
- - [多值类型说明](http://easyrec.oss-cn-beijing.aliyuncs.com/fg_docs/%E5%A4%9A%E5%80%BC%E7%B1%BB%E5%9E%8B.pdf)
+ - [多值类型说明](./fg_docs/mutiValues.md)
- vocab_file: 词典文件路径,根据词典将对应的输入映射成ID.
@@ -62,7 +62,7 @@
- embedding_dimension/embedding_dim: 对应EasyRec feature_config.features里面的embedding_dim.
- - [raw_feature](http://easyrec.oss-cn-beijing.aliyuncs.com/fg_docs/RawFeature.pdf)
+ - [raw_feature](./fg_docs/RawFeature.md)
- bucketize_boundaries: 会生成离散化的结果, 在生成EasyRec config的时候:
@@ -94,10 +94,10 @@
- 该选项对生成数据有影响.
- 该选项对生成EasyRec config也有影响, 对应到[feature_config.raw_input_dim](../proto.html#protos.FeatureConfig)
- - [combo_feature](http://easyrec.oss-cn-beijing.aliyuncs.com/fg_docs/ComboFeature.pdf)
+ - [combo_feature](./fg_docs/ComboFeature.md)
- 需要设置embedding_dimension和hash_bucket_size.
- 方法一:在fg中生成combo特征,见[ComboFeature](http://easyrec.oss-cn-beijing.aliyuncs.com/fg_docs/ComboFeature.pdf)
+ 方法一:在fg中生成combo特征,见[combo_feature](./fg_docs/ComboFeature.md)
```
{"expression" : ["user:user_id", "user:occupation"], "feature_name" : "combo__occupation_age_level", "feature_type" : "combo_feature", "hash_bucket_size": 10, "embedding_dim": 16}
@@ -124,11 +124,11 @@
- feature_names: 除当前特征外,参与combo的特征,至少一项.
- combiner, hash_bucket_size, embedding_dim 配置与上述一致.
- - [lookup_feature](http://easyrec.oss-cn-beijing.aliyuncs.com/fg_docs/LookupFeature.pdf)
+ - [lookup_feature](./fg_docs/LookupFeature.md)
- 单层查找, 根据id(如item_id, item_category_id等)查找对应的value.
- - [match_feature](http://easyrec.oss-cn-beijing.aliyuncs.com/fg_docs/MatchFeature.pdf)
+ - [match_feature](./fg_docs/MatchFeature.md)
- 双层查找, 根据category和item_id查找value.
@@ -140,7 +140,7 @@
- needWeighting: 生成特征权重,即kv格式, kv之间用\[ctrl+v ctrl+e\]分割, 转换成TagFeature.
- - [sequence_feature](http://easyrec.oss-cn-beijing.aliyuncs.com/fg_docs/SequenceFeature.pdf)
+ - [sequence_feature](./fg_docs/SequenceFeature.md)
- 序列特征用于对用户行为建模, 通常应用于DIN和Transformer模型当中
@@ -158,7 +158,7 @@
- Note: item_seq(如item的图片列表)目前还不支持
- - [overlap_feature](http://easyrec.oss-cn-beijing.aliyuncs.com/fg_docs/OverLapFeature.pdf)
+ - [overlap_feature](./fg_docs/OverLapFeature.md)
- 针对EasyRec的扩展字段:
@@ -223,16 +223,11 @@
| ----- | ------- | ------- | --------------- | --------------------------------------------------------------- | -------------------------------------------------- |
| 0 | 122017 | 389957 | | tag_category_list:4589,new_user_class_level:,...,user_id:122017 | adgroup_id:539227,pid:430548_1007,...,cate_id:4281 |
-```sql
--- taobao_train_input.txt oss://easyrec/data/rtp/
--- wget http://easyrec.oss-cn-beijing.aliyuncs.com/data/rtp/taobao_train_input.txt
--- wget http://easyrec.oss-cn-beijing.aliyuncs.com/data/rtp/taobao_test_input.txt
-drop table if exists taobao_train_input;
-create table if not exists taobao_train_input(`label` BIGINT,user_id STRING,item_id STRING,context_feature STRING,user_feature STRING,item_feature STRING);
-tunnel upload taobao_train_input.txt taobao_train_input -fd=';';
-drop table if exists taobao_test_input;
-create table if not exists taobao_test_input(`label` BIGINT,user_id STRING,item_id STRING,context_feature STRING,user_feature STRING,item_feature STRING);
-tunnel upload taobao_test_input.txt taobao_test_input -fd=';';
+提供了在任何项目下都可以访问的两张样例表:
+
+```
+pai_online_project.taobao_train_input
+pai_online_project.taobao_test_input
```
- 稠密格式的数据,每个特征是单独的一列,如:
@@ -242,7 +237,7 @@ tunnel upload taobao_test_input.txt taobao_test_input -fd=';';
| 1 | 122017 | 389957 | 4589 | | 0 |
```sql
- drop table if exists taobao_train_input;
+ drop table if exists taobao_train_input_dense;
create table taobao_train_input_dense(label bigint, user_id string, item_id string, tag_category_list bigint, ...);
```
@@ -267,10 +262,14 @@ set odps.sql.counters.dynamic.limit=true;
drop table if exists taobao_fg_train_out;
create table taobao_fg_train_out(label bigint, user_id string, item_id string, features string);
-jar -resources fg_on_odps-1.3.59-jar-with-dependencies.jar,fg.json -classpath fg_on_odps-1.3.59-jar-with-dependencies.jar com.taobao.fg_on_odps.EasyRecFGMapper -i taobao_train_input -o taobao_fg_train_out -f fg.json;
+-- dataworks内运行,注意需要带有resource_reference这一行
+--@resource_reference{"fg_on_odps-1.3.59-jar-with-dependencies.jar"}
+jar -resources fg_on_odps-1.3.59-jar-with-dependencies.jar,fg.json -classpath fg_on_odps-1.3.59-jar-with-dependencies.jar com.taobao.fg_on_odps.EasyRecFGMapper -i pai_online_project.taobao_train_input -o taobao_fg_train_out -f fg.json;
drop table if exists taobao_fg_test_out;
create table taobao_fg_test_out(label bigint, user_id string, item_id string, features string);
-jar -resources fg_on_odps-1.3.59-jar-with-dependencies.jar,fg.json -classpath fg_on_odps-1.3.59-jar-with-dependencies.jar com.taobao.fg_on_odps.EasyRecFGMapper -i taobao_test_input -o taobao_fg_test_out -f fg.json;
+-- dataworks内运行,注意需要带有resource_reference这一行
+--@resource_reference{"fg_on_odps-1.3.59-jar-with-dependencies.jar"}
+jar -resources fg_on_odps-1.3.59-jar-with-dependencies.jar,fg.json -classpath fg_on_odps-1.3.59-jar-with-dependencies.jar com.taobao.fg_on_odps.EasyRecFGMapper -i pai_online_project.taobao_test_input -o taobao_fg_test_out -f fg.json;
--下载查看数据(可选)
tunnel download taobao_fg_test_out taobao_fg_test_out.txt -fd=';';
@@ -281,6 +280,7 @@ tunnel download taobao_fg_test_out taobao_fg_test_out.txt -fd=';';
- 支持分区表,分区表可以指定partition,也可以不指定partition,不指定partition时使用所有partition
- **分区格式示例:** my_table/day=20201010,sex=male
- 可以用多个-i指定**多个表的多个分区**
+ - 支持添加project,示例:project.table/ds=xxx
- -o, 输出表,如果是分区表,一定要指定分区,只能指定一个输出表
- -f, fg.json
- -m, mapper memory的大小,默认可以不设置
From 1af9b3699fb5020efd7ead39cf14f8a49ecd9a3d Mon Sep 17 00:00:00 2001
From: wwxxzz
Date: Wed, 20 Sep 2023 11:42:21 +0800
Subject: [PATCH 2/2] update easyrec version to 0.7.1 (#420)
---
easy_rec/version.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/easy_rec/version.py b/easy_rec/version.py
index 5d3a16322..40c518215 100644
--- a/easy_rec/version.py
+++ b/easy_rec/version.py
@@ -1,3 +1,3 @@
# -*- encoding:utf-8 -*-
# Copyright (c) Alibaba, Inc. and its affiliates.
-__version__ = '0.7.0'
+__version__ = '0.7.1'