Skip to content

Commit 14c138f

Browse files
committed
feat: add binary to decimal and binary to hex transformations; update test cases and configuration
1 parent d0ae10f commit 14c138f

8 files changed

Lines changed: 201 additions & 12 deletions

File tree

core/dbio/database/database_bigquery.go

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -601,6 +601,7 @@ func (conn *BigQueryConn) importViaLocalStorage(tableFName string, df *iop.Dataf
601601

602602
go func() {
603603
config := iop.LoaderStreamConfig(true)
604+
config.TargetType = conn.GetType()
604605
_, err = fs.WriteDataflowReady(df, localPath, fileReadyChn, config)
605606

606607
if err != nil {
@@ -692,6 +693,7 @@ func (conn *BigQueryConn) importViaGoogleStorage(tableFName string, df *iop.Data
692693

693694
go func() {
694695
config := iop.LoaderStreamConfig(true)
696+
config.TargetType = conn.GetType()
695697
_, err = fs.WriteDataflowReady(df, gcsPath, fileReadyChn, config)
696698

697699
if err != nil {

core/dbio/database/database_snowflake.go

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -785,6 +785,7 @@ func (conn *SnowflakeConn) CopyViaStage(table Table, df *iop.Dataflow) (count ui
785785
}
786786

787787
config := iop.LoaderStreamConfig(true)
788+
config.TargetType = conn.GetType()
788789
_, err = fs.WriteDataflowReady(df, folderPath, fileReadyChn, config)
789790

790791
if err != nil {

core/dbio/iop/stream_processor.go

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1028,6 +1028,8 @@ func (sp *StreamProcessor) CastToString(i int, val interface{}, valType ...Colum
10281028
return tVal.Format("2006-01-02 15:04:05.999999999") + " +00"
10291029
}
10301030
return tVal.Format("2006-01-02 15:04:05.999999999 -07")
1031+
case typ.IsBinary() && g.In(sp.Config.TargetType, dbio.TypeDbSnowflake, dbio.TypeDbBigQuery):
1032+
return Transforms.BinaryToHex(cast.ToString(val))
10311033
default:
10321034
strVal := cast.ToString(val)
10331035
if !utf8.ValidString(strVal) {

core/dbio/iop/transforms.go

Lines changed: 61 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,8 @@ func init() {
4444
TransformHashSha256,
4545
TransformHashSha512,
4646
TransformParseBit,
47+
TransformBinaryToDecimal,
48+
TransformBinaryToHex,
4749
TransformParseFix,
4850
TransformParseUuid,
4951
TransformParseMsUuid,
@@ -250,6 +252,20 @@ var (
250252
},
251253
}
252254

255+
// TransformBinaryToDecimal exposes Transforms.BinaryToDecimal under the
// transform name "binary_to_decimal".
TransformBinaryToDecimal = Transform{
	Name: "binary_to_decimal",
	FuncString: func(sp *StreamProcessor, raw string) (string, error) {
		converted, err := Transforms.BinaryToDecimal(sp, raw)
		return converted, err
	},
}
261+
262+
// TransformBinaryToHex exposes Transforms.BinaryToHex under the transform
// name "binary_to_hex". The stream processor argument is not needed and the
// returned error is always nil.
TransformBinaryToHex = Transform{
	Name: "binary_to_hex",
	FuncString: func(_ *StreamProcessor, raw string) (string, error) {
		hexStr := Transforms.BinaryToHex(raw)
		return hexStr, nil
	},
}
268+
253269
TransformParseFix = Transform{
254270
Name: "parse_fix",
255271
FuncString: func(sp *StreamProcessor, val string) (string, error) {
@@ -494,6 +510,51 @@ func (t transformsNS) ParseBit(sp *StreamProcessor, val string) (string, error)
494510
return val, nil
495511
}
496512

513+
func (t transformsNS) BinaryToDecimal(sp *StreamProcessor, val string) (string, error) {
514+
// Handle MySQL BIT type which can be 1 to 64 bits (1 to 8 bytes)
515+
// Convert binary data to decimal representation for better compatibility
516+
if len(val) > 0 && len(val) <= 8 {
517+
// Check if it's binary data (all bytes are either printable or control chars)
518+
isBinary := true
519+
for _, b := range []byte(val) {
520+
// If we have high bit values or control characters, treat as binary
521+
if b > 127 || (b < 32 && b != 9 && b != 10 && b != 13) {
522+
isBinary = true
523+
break
524+
}
525+
// If we have regular ASCII text, don't treat as binary
526+
if b >= 32 && b <= 126 {
527+
isBinary = false
528+
}
529+
}
530+
531+
if isBinary {
532+
// Convert binary data to uint64 (big-endian)
533+
var result uint64
534+
for i, b := range []byte(val) {
535+
result |= uint64(b) << (8 * (len(val) - 1 - i))
536+
}
537+
return fmt.Sprintf("%d", result), nil
538+
}
539+
}
540+
return val, nil
541+
}
542+
543+
func (t transformsNS) BinaryToHex(val string) string {
544+
// Convert binary data to hexadecimal representation for Snowflake COPY
545+
if len(val) == 0 {
546+
return ""
547+
}
548+
549+
// Convert each byte to hex and concatenate
550+
hexStr := ""
551+
for _, b := range []byte(val) {
552+
hexStr += fmt.Sprintf("%02X", b)
553+
}
554+
555+
return hexStr
556+
}
557+
497558
func (t transformsNS) Replace0x00(sp *StreamProcessor, val string) (string, error) {
498559
return strings.ReplaceAll(strings.ReplaceAll(val, "\x00", ""), "\\u0000", "u-0000"), nil // replace the NUL character
499560
}

core/dbio/iop/transforms_test.go

Lines changed: 132 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -68,3 +68,135 @@ func TestTransformMsUUID(t *testing.T) {
6868
val, _ := Transforms.ParseMsUUID(sp, cast.ToString(uuidBytes))
6969
assert.Equal(t, "12345678-1234-1234-1234-123456789abc", val)
7070
}
71+
72+
func TestBinaryToDecimal(t *testing.T) {
73+
sp := NewStreamProcessor()
74+
75+
// Test cases for various BIT sizes
76+
testCases := []struct {
77+
name string
78+
input []byte
79+
expected string
80+
}{
81+
{
82+
name: "BIT(1) - 0",
83+
input: []byte{0x00},
84+
expected: "0",
85+
},
86+
{
87+
name: "BIT(1) - 1",
88+
input: []byte{0x01},
89+
expected: "1",
90+
},
91+
{
92+
name: "BIT(8) - 255",
93+
input: []byte{0xFF},
94+
expected: "255",
95+
},
96+
{
97+
name: "BIT(16) - 65535",
98+
input: []byte{0xFF, 0xFF},
99+
expected: "65535",
100+
},
101+
{
102+
name: "BIT(24) - 16777215",
103+
input: []byte{0xFF, 0xFF, 0xFF},
104+
expected: "16777215",
105+
},
106+
{
107+
name: "BIT(32) - 4294967295",
108+
input: []byte{0xFF, 0xFF, 0xFF, 0xFF},
109+
expected: "4294967295",
110+
},
111+
{
112+
name: "BIT(64) - max value",
113+
input: []byte{0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF},
114+
expected: "18446744073709551615",
115+
},
116+
{
117+
name: "BIT(8) - binary 10101010",
118+
input: []byte{0xAA}, // binary 10101010
119+
expected: "170",
120+
},
121+
{
122+
name: "BIT(16) - binary pattern",
123+
input: []byte{0x12, 0x34}, // 0x1234 = 4660
124+
expected: "4660",
125+
},
126+
{
127+
name: "Regular text should not be converted",
128+
input: []byte("hello"),
129+
expected: "hello",
130+
},
131+
}
132+
133+
for _, tc := range testCases {
134+
t.Run(tc.name, func(t *testing.T) {
135+
val, err := Transforms.BinaryToDecimal(sp, string(tc.input))
136+
assert.NoError(t, err)
137+
assert.Equal(t, tc.expected, val, "Failed for test case: %s", tc.name)
138+
})
139+
}
140+
}
141+
142+
func TestBinaryToHex(t *testing.T) {
143+
// Test cases for ToHex transform
144+
testCases := []struct {
145+
name string
146+
input []byte
147+
expected string
148+
}{
149+
{
150+
name: "Empty input",
151+
input: []byte{},
152+
expected: "",
153+
},
154+
{
155+
name: "Single byte - 0x00",
156+
input: []byte{0x00},
157+
expected: "00",
158+
},
159+
{
160+
name: "Single byte - 0x01",
161+
input: []byte{0x01},
162+
expected: "01",
163+
},
164+
{
165+
name: "Single byte - 0xFF",
166+
input: []byte{0xFF},
167+
expected: "FF",
168+
},
169+
{
170+
name: "Two bytes - 0x1234",
171+
input: []byte{0x12, 0x34},
172+
expected: "1234",
173+
},
174+
{
175+
name: "Four bytes - 0xDEADBEEF",
176+
input: []byte{0xDE, 0xAD, 0xBE, 0xEF},
177+
expected: "DEADBEEF",
178+
},
179+
{
180+
name: "Text - Hello",
181+
input: []byte("Hello"),
182+
expected: "48656C6C6F",
183+
},
184+
{
185+
name: "Eight bytes - all 0xFF",
186+
input: []byte{0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF},
187+
expected: "FFFFFFFFFFFFFFFF",
188+
},
189+
{
190+
name: "Alternating pattern",
191+
input: []byte{0xAA, 0x55, 0xAA, 0x55},
192+
expected: "AA55AA55",
193+
},
194+
}
195+
196+
for _, tc := range testCases {
197+
t.Run(tc.name, func(t *testing.T) {
198+
result := Transforms.BinaryToHex(string(tc.input))
199+
assert.Equal(t, tc.expected, result, "Failed for test case: %s", tc.name)
200+
})
201+
}
202+
}

core/dbio/scripts/test.sh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@ go test -v -run 'TestConnection'
88
cd -
99

1010
cd iop
11-
go test -timeout 5m -v -run 'TestParseDate|TestDetectDelimiter|TestFIX|TestConstraints|TestDuckDb|TestParquetDuckDb|TestIcebergReader|TestDeltaReader|TestPartition|TestExtractPartitionTimeValue|TestGetLowestPartTimeUnit|TestMatchedPartitionMask|TestGeneratePartURIsFromRange|TestDataset|TestValidateNames|TestExcelDateToTime'
11+
go test -timeout 5m -v -run 'TestParseDate|TestDetectDelimiter|TestFIX|TestConstraints|TestDuckDb|TestParquetDuckDb|TestIcebergReader|TestDeltaReader|TestPartition|TestExtractPartitionTimeValue|TestGetLowestPartTimeUnit|TestMatchedPartitionMask|TestGeneratePartURIsFromRange|TestDataset|TestValidateNames|TestExcelDateToTime|TestBinaryToHex|TestBinaryToDecimal'
1212
cd -
1313

1414
cd database

core/dbio/templates/types_native_to_general.tsv

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -155,7 +155,7 @@ duckdb hugeint bigint col_hugeint hugeint TRUE TRUE
155155
duckdb interval string col_interval interval TRUE TRUE
156156
mariadb bigint bigint col_bigint bigint TRUE TRUE A large integer
157157
mariadb binary binary col_binary binary(100) TRUE TRUE A fixed-length binary string
158-
mariadb bit smallint col_bit bit FALSE FALSE A bit field
158+
mariadb bit binary col_bit bit FALSE FALSE A bit field
159159
mariadb blob text col_blob blob TRUE TRUE A small BLOB
160160
mariadb char string col_char char(17) TRUE TRUE A fixed-length nonbinary (character) string
161161
mariadb date date col_date date TRUE TRUE A date value in CCYY-MM-DD format
@@ -227,7 +227,7 @@ motherduck hugeint bigint col_hugeint hugeint TRUE TRUE
227227
motherduck interval string col_interval interval TRUE TRUE
228228
mysql bigint bigint col_bigint bigint TRUE TRUE A large integer
229229
mysql binary binary col_binary binary(100) TRUE TRUE A fixed-length binary string
230-
mysql bit smallint col_bit bit FALSE FALSE A bit field
230+
mysql bit binary col_bit bit FALSE FALSE A bit field
231231
mysql blob text col_blob blob TRUE TRUE A small BLOB
232232
mysql char string col_char char(17) TRUE TRUE A fixed-length nonbinary (character) string
233233
mysql date date col_date date TRUE TRUE A date value in CCYY-MM-DD format

core/sling/config.go

Lines changed: 0 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -136,17 +136,8 @@ func (cfg *Config) SetDefault() {
136136
}
137137
}
138138

139-
// set default transforms
140-
switch cfg.SrcConn.Type {
141-
case dbio.TypeDbMySQL, dbio.TypeDbMariaDB, dbio.TypeDbStarRocks:
142-
// parse_bit for MySQL
143-
cfg.extraTransforms = append(cfg.extraTransforms, "parse_bit")
144-
}
145-
146139
// set default metadata
147140
switch {
148-
case g.In(cfg.TgtConn.Type, dbio.TypeDbStarRocks):
149-
cfg.extraTransforms = append(cfg.extraTransforms, "parse_bit")
150141
case g.In(cfg.TgtConn.Type, dbio.TypeDbBigQuery):
151142
cfg.Target.Options.DatetimeFormat = "2006-01-02 15:04:05.000000-07"
152143
}

0 commit comments

Comments
 (0)