-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathfloat8.go
More file actions
120 lines (95 loc) · 2.95 KB
/
float8.go
File metadata and controls
120 lines (95 loc) · 2.95 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
//
// Copyright (C) 2024 Dmitry Kolesnikov
//
// This file may be modified and distributed under the terms
// of the MIT license. See the LICENSE file for details.
// https://github.com/kshard/float8
//
// Package float8 implements a minifloat (https://en.wikipedia.org/wiki/Minifloat)
// compatible with IEEE 754 and FP8 E4M3.
// The number is defined as ±mantissa × 2^exponent.
package float8
import (
"math"
)
const (
	// Bit layout of a Float8 value, most significant bit first: S EEEE MMM.
	signMask     = 0x80 // 0b10000000
	exponentMask = 0x78 // 0b01111000
	mantissaMask = 0x07 // 0b00000111
	mantissaLen  = 3    // width of the mantissa field in bits

	// Exponent parameters for the 4-bit exponent field.
	// See https://en.wikipedia.org/wiki/Exponent_bias
	//
	//	bias = 2^(|exponent|-1) - 1
	//	high = 2^|exponent| - 1
	exponentBias = 7
	exponentHi   = 15
	exponentLo   = -7 // NOTE(review): presumably the lowest unbiased exponent (0 - bias); confirm against its users

	// The mantissa (significand) holds the precision bits of the number.
	// With 3 mantissa bits, the stored bits are fractional and have to be
	// scaled into [1, 2) when converting to a wider floating-point format.
	// The quantities involved are documented here but not declared as
	// constants:
	//
	//	mantissaBias = 8.0 (2^|mantissa|)
	//	base         = 2   (exponent base)

	// float32Bias is the exponent bias of the float32 format.
	float32Bias = 127
)
// Infinity is the Float8 bit pattern with the sign bit clear and every
// exponent and mantissa bit set (0x7f). ToFloat8 returns it when the input
// does not fit into the 4-bit exponent range.
//
// NOTE(review): an IEEE 754-style layout would encode infinity as all-ones
// exponent with a zero mantissa (0x78); confirm that 0x7f is what the decode
// tables expect.
const Infinity = 0x7f | mantissaMask
// Float8 is an 8-bit minifloat stored in a single byte: 1 sign bit,
// 4 exponent bits and 3 mantissa bits (see signMask, exponentMask and
// mantissaMask). It is declared as an alias, so Float8 and uint8 are the
// same type and can be used interchangeably without conversion.
type Float8 = uint8
// ToFloat8 converts a float32 to its Float8 (1 sign, 4 exponent, 3 mantissa
// bits) encoding.
//
// Conversion rules:
//   - +0 and -0 both encode as 0x00.
//   - The float32 exponent is re-biased from 127 to 7.
//   - If the re-biased exponent exceeds exponentHi (overflow, including ±Inf),
//     the result is Infinity carrying the sign of the input. NaN has no
//     meaningful sign and always encodes as the unsigned Infinity pattern.
//   - If the re-biased exponent is negative (underflow, including float32
//     subnormals), the result is 0x00.
//   - Otherwise the top 3 bits of the float32 fraction become the mantissa.
//     The lower 20 bits are truncated, not rounded.
//
// Note that finite magnitudes in [480, 512) have exponent 15 and mantissa 7
// and therefore produce the same bit pattern as Infinity.
func ToFloat8(f32 float32) Float8 {
	if f32 == 0.0 {
		return 0x00
	}

	bits := math.Float32bits(f32)

	// Sign bit moved straight into its Float8 position (bit 7).
	sign := uint8(bits>>31) << 7

	// Re-bias the 8-bit float32 exponent for the 4-bit Float8 exponent field.
	exponent := int((bits>>23)&0xFF) - float32Bias + exponentBias

	switch {
	case exponent > exponentHi:
		if math.IsNaN(float64(f32)) {
			return Infinity
		}
		// Saturate, but keep the sign so that large negative inputs do not
		// turn into a positive value.
		return sign | Infinity
	case exponent < 0:
		return 0x00
	}

	// Keep the three most significant fraction bits (bits 22..20). The
	// implicit leading 1 of the float32 significand sits at bit 23 and is
	// never part of the stored mantissa, so it is not materialized here.
	mantissa := uint8(bits>>20) & mantissaMask

	return sign | uint8(exponent)<<3 | mantissa
}
// ToSlice8 converts every element of f32s with ToFloat8 and returns the
// results in a newly allocated slice of the same length. An empty or nil
// input yields an empty, non-nil slice.
//
// Elements are processed four at a time through fixed-size sub-slices; this
// unrolling is kept from the original implementation, which reports it as
// faster than a plain range loop (not re-measured here). Any leftover 1-3
// elements are converted by a tail loop, so the input length does not have to
// be a multiple of 4.
func ToSlice8(f32s []float32) (f8s []Float8) {
	f8s = make([]Float8, len(f32s))

	// Largest multiple of 4 that does not exceed the input length.
	n := len(f32s) &^ 3

	for i := 0; i < n; i += 4 {
		// Full slice expressions pin len and cap to 4 so the four indexed
		// accesses below are provably in range.
		a := f32s[i : i+4 : i+4]
		b := f8s[i : i+4 : i+4]
		b[0], b[1], b[2], b[3] = ToFloat8(a[0]), ToFloat8(a[1]), ToFloat8(a[2]), ToFloat8(a[3])
	}

	for i := n; i < len(f32s); i++ {
		f8s[i] = ToFloat8(f32s[i])
	}

	return f8s
}
// ToFloat32 converts a Float8 to float32 by indexing the package-level
// f8tof32 table (declared outside this chunk) with the raw byte value.
func ToFloat32(f8 Float8) float32 {
	value := f8tof32[f8]
	return value
}
// Add returns the entry of the package-level add table (declared outside this
// chunk) for the operand pair (a, b). The table index is 16 bits wide: a
// occupies the high byte and b the low byte.
func Add(a, b Float8) Float8 {
	idx := int(a)<<8 | int(b)
	return add[idx]
}
// Sub returns the entry of the package-level sub table (declared outside this
// chunk) for the operand pair (a, b). The table index is 16 bits wide: a
// occupies the high byte and b the low byte.
func Sub(a, b Float8) Float8 {
	idx := int(a)<<8 | int(b)
	return sub[idx]
}
// Mul returns the entry of the package-level mul table (declared outside this
// chunk) for the operand pair (a, b). The table index is 16 bits wide: a
// occupies the high byte and b the low byte.
func Mul(a, b Float8) Float8 {
	idx := int(a)<<8 | int(b)
	return mul[idx]
}
// Div returns the entry of the package-level div table (declared outside this
// chunk) for the operand pair (a, b). The table index is 16 bits wide: a
// occupies the high byte and b the low byte.
func Div(a, b Float8) Float8 {
	idx := int(a)<<8 | int(b)
	return div[idx]
}