float8/float8.go at main · kshard/float8 · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
//
// Copyright (C) 2024 Dmitry Kolesnikov
//
// This file may be modified and distributed under the terms
// of the MIT license.  See the LICENSE file for details.
// https://github.com/kshard/float8
//

// Package float8 implements a minifloat (https://en.wikipedia.org/wiki/Minifloat)
// compatible with IEEE 754 and FP8 E4M3.
// A number is defined as ±mantissa × 2^exponent.
package float8

import (
	"math"
)

const (
	// Bit layout of a Float8 value:
	// 1 sign bit (0b10000000), 4 exponent bits (0b01111000)
	// and 3 mantissa bits (0b00000111).
	signMask     = 0x80
	exponentMask = 0x78
	mantissaLen  = 3
	mantissaMask = 1<<mantissaLen - 1

	// Exponent bias, see https://en.wikipedia.org/wiki/Exponent_bias
	//
	// bias = 2^(|exponent|-1) - 1
	// high = 2^|exponent| - 1
	// low  = -bias
	exponentBias = 7
	exponentHi   = 15
	exponentLo   = -exponentBias

	// The mantissa (significand) carries the precision bits of the number.
	// With 3 mantissa bits the stored value is the fractional part of a
	// significand in the range [1, 2); the scaling factor for that fraction
	// is 2^|mantissa|, i.e. mantissaBias = 8.0 (kept here as a note only,
	// the constant itself is not declared).
	//
	// The exponent base is 2 (also not declared as a constant).

	// Exponent bias of float32, used when re-biasing a float32 exponent.
	float32Bias = 127
)

// Infinity is the bit pattern 0x7f: sign bit clear, every exponent and
// mantissa bit set. ToFloat8 returns it when the input is too large to be
// represented.
const Infinity = mantissaMask | 0x7f

// Float8 is an 8-bit minifloat. It is a type alias of uint8, so any uint8
// value can be used as a Float8 without conversion.
// Following the masks declared in this package, the bits are laid out as
// 1 sign bit (0x80), 4 exponent bits (0x78) and 3 mantissa bits (0x07).
type Float8 = uint8

// ToFloat8 converts a float32 to its Float8 encoding.
//
// The float32 exponent is re-biased from 127 to 7 and the mantissa is
// truncated (not rounded) to its 3 most significant bits.
//
// Special cases:
//   - ±0 is encoded as 0x00.
//   - Values whose re-biased exponent is below 0 (including float32
//     subnormals) underflow to 0x00.
//   - Values whose re-biased exponent is above exponentHi (including ±Inf
//     and NaN inputs) saturate to Infinity with the sign bit of the input
//     preserved, so large negative inputs stay negative.
//
// NOTE(review): an input with re-biased exponent 15 and mantissa bits 111
// produces the same bit pattern as Infinity.
func ToFloat8(f32 float32) Float8 {
	if f32 == 0.0 {
		return 0x00
	}

	bits := math.Float32bits(f32)

	// Sign bit moved straight into its Float8 position (bit 7).
	sign := uint8(bits>>31) << 7

	// Extract the 8-bit exponent and adjust it from the float32 bias (127)
	// to the minifloat bias (7).
	exponent := int((bits>>23)&0xFF) - float32Bias + exponentBias

	// Handle overflow and underflow.
	switch {
	case exponent > exponentHi:
		return sign | Infinity
	case exponent < 0:
		return 0x00
	}

	// Keep the top 3 of the 23 stored mantissa bits. The implicit leading 1
	// of a normalized float32 is not stored in Float8 either, so it never
	// has to be materialized.
	mantissa := uint8(bits>>20) & mantissaMask

	return sign | uint8(exponent)<<3 | mantissa
}

// ToSlice8 converts every element of f32s with ToFloat8 and returns the
// results in a newly allocated slice of the same length.
//
// Elements are processed four at a time; the original author notes that
// this is faster than a plain range over []float32. Any 1-3 trailing
// elements that do not fill a group of four are converted one by one, so
// input of any length (including empty) is accepted.
func ToSlice8(f32s []float32) (f8s []Float8) {
	f8s = make([]Float8, len(f32s))

	// Length of the largest prefix that is a multiple of 4.
	aligned := len(f32s) &^ 3

	for i := 0; i < aligned; i += 4 {
		// Full slice expressions pin both windows to exactly 4 elements.
		a := f32s[i : i+4 : i+4]
		b := f8s[i : i+4 : i+4]

		b[0], b[1], b[2], b[3] = ToFloat8(a[0]), ToFloat8(a[1]), ToFloat8(a[2]), ToFloat8(a[3])
	}

	// Remainder that does not fill a whole group of four.
	for i := aligned; i < len(f32s); i++ {
		f8s[i] = ToFloat8(f32s[i])
	}

	return f8s
}

// ToFloat32 converts a Float8 to float32 by looking the value up in the
// package-level f8tof32 table, indexed by the raw 8-bit pattern.
func ToFloat32(f8 Float8) float32 {
	value := f8tof32[f8]
	return value
}

// Add returns a + b for two Float8 values. The result is read from the
// package-level add table; the index holds a in the high byte and b in the
// low byte.
func Add(a, b Float8) Float8 {
	idx := int(a)<<8 | int(b)
	return add[idx]
}

// Sub returns a - b for two Float8 values. The result is read from the
// package-level sub table; the index holds a in the high byte and b in the
// low byte.
func Sub(a, b Float8) Float8 {
	idx := int(a)<<8 | int(b)
	return sub[idx]
}

// Mul returns a × b for two Float8 values. The result is read from the
// package-level mul table; the index holds a in the high byte and b in the
// low byte.
func Mul(a, b Float8) Float8 {
	idx := int(a)<<8 | int(b)
	return mul[idx]
}

// Div returns a ÷ b for two Float8 values. The result is read from the
// package-level div table; the index holds a in the high byte and b in the
// low byte.
func Div(a, b Float8) Float8 {
	idx := int(a)<<8 | int(b)
	return div[idx]
}