Binary Floating Point
Definition
- Definition
- Explanation
- Guidance
- Tips
The Binary Floating Point Algorithm is a method used for representing real numbers in a binary format. It involves breaking down a real number into its binary representation consisting of a sign bit, an exponent, and a fraction (also known as mantissa). This algorithm ensures efficient storage and arithmetic operations on floating-point numbers.
Computers store floating-point numbers using the IEEE 754 standard, which covers a wide range of magnitudes, including very small numbers close to zero. The standard stores the exponent in biased form, so that negative exponents can be represented as unsigned values.
This standard splits the representation of numbers into parts:
- sign
- exponent
- fraction
Different floating-point formats allocate varying numbers of bits for each part:
- half-precision (16 bits)
- single-precision (32 bits)
- double-precision (64 bits)
Given a real number, first determine its sign, then normalize it by adjusting its exponent and fraction. Convert the normalized components into binary, combine them to form the floating-point representation, and finally output this binary representation.
- Extract Sign
- if the number is positive, set the sign bit to 0 else set it to 1
- Normalize
- determine the exponent required to represent the number accurately. Adjust the fraction accordingly
- Convert to Binary
- convert the sign bit, exponent, and fraction into their binary representations
- Combine Components
- concatenate the binary representations of the sign, exponent, and fraction
- Output
- output the combined binary representation as the binary floating-point format of the input number
- ensure proper handling of special cases like zero, infinity, and NaN (Not a Number)
- consider the precision requirements of the application to determine the number of bits allocated for the exponent and fraction
- implement proper rounding techniques to minimize errors in floating-point arithmetic
Practice
- Practice
- Solution
function binaryFloatingPointAlgorithm(realNumber):
    // Sign: 1 marks a negative input, 0 marks everything else
    signBit = 1 if realNumber < 0 else 0

    // Normalization: derive the exponent first, then the fraction that
    // goes with that exponent
    exponent = calculateExponent(realNumber)
    fraction = calculateFraction(realNumber, exponent)

    // Encode each component in binary and join them in the order
    // sign, exponent, fraction; the joined string is the result
    return concatenate(
        convertToBinary(signBit),
        convertToBinary(exponent),
        convertToBinary(fraction)
    )
package main
import (
	"encoding/binary"
	"fmt"
	"math"
)
// FloatAsBinaryString returns the IEEE 754 bit pattern of floatNumber as a
// string of '0' and '1' characters, most significant bit first, so the string
// reads sign bit, then exponent, then fraction.
//
// byteLength selects the format: 4 converts floatNumber to float32 and yields
// 32 characters; 8 encodes it as float64 and yields 64 characters. Any other
// byteLength returns the empty string, because there is no encoding to show.
func FloatAsBinaryString(floatNumber float64, byteLength int) string {
	var buf []byte
	switch byteLength {
	case 4:
		buf = make([]byte, 4)
		// Big-endian puts the byte holding the sign bit first.
		binary.BigEndian.PutUint32(buf, math.Float32bits(float32(floatNumber)))
	case 8:
		buf = make([]byte, 8)
		binary.BigEndian.PutUint64(buf, math.Float64bits(floatNumber))
	default:
		// An all-zero string would be indistinguishable from +0.0.
		return ""
	}

	numberAsBinaryString := ""
	for _, b := range buf {
		numberAsBinaryString += fmt.Sprintf("%08b", b)
	}
	return numberAsBinaryString
}
import java.nio.ByteBuffer;
public class Main {
    /**
     * Returns the IEEE 754 bit pattern of a number as a string of '0' and '1'
     * characters, most significant bit first (sign, exponent, fraction).
     *
     * @param floatNumber the value to encode
     * @param byteLength  4 for the 32-bit single-precision encoding, or 8 for
     *                    the 64-bit double-precision encoding of the value
     *                    widened to {@code double}
     * @return a string of exactly {@code byteLength * 8} binary digits
     * @throws IllegalArgumentException if {@code byteLength} is neither 4 nor 8
     */
    public static String floatAsBinaryString(float floatNumber, int byteLength) {
        ByteBuffer buffer;
        switch (byteLength) {
            case 4:
                buffer = ByteBuffer.allocate(4);
                buffer.putFloat(floatNumber);
                break;
            case 8:
                // putFloat would fill only half of an 8-byte buffer; write a
                // real double so all 64 bits carry the encoding.
                buffer = ByteBuffer.allocate(8);
                buffer.putDouble(floatNumber);
                break;
            default:
                throw new IllegalArgumentException("Unsupported byte length: " + byteLength);
        }

        StringBuilder numberAsBinaryString = new StringBuilder(byteLength * 8);
        for (byte b : buffer.array()) {
            // Mask to 0..255 so negative bytes do not sign-extend to 32 digits.
            String bits = Integer.toBinaryString(b & 0xFF);
            for (int pad = bits.length(); pad < 8; pad++) {
                numberAsBinaryString.append('0');
            }
            numberAsBinaryString.append(bits);
        }
        return numberAsBinaryString.toString();
    }
}
/**
 * Returns the IEEE 754 bit pattern of a number as a string of "0" and "1"
 * characters, most significant bit first (sign, exponent, fraction).
 *
 * @param {number} floatNumber - The value to encode.
 * @param {number} [byteLength=4] - 4 for single precision (32 bits) or
 *   8 for double precision (64 bits).
 * @returns {string} A string of exactly `byteLength * 8` binary digits.
 * @throws {RangeError} If `byteLength` is neither 4 nor 8.
 */
function floatAsBinaryString(floatNumber, byteLength = 4) {
  const singlePrecisionBytesLength = 4;
  const doublePrecisionBytesLength = 8;
  const bitsInByte = 8;

  // Reject other sizes up front: a larger buffer would otherwise be filled
  // with a 64-bit value followed by meaningless zero bytes.
  if (
    byteLength !== singlePrecisionBytesLength &&
    byteLength !== doublePrecisionBytesLength
  ) {
    throw new RangeError(`Unsupported byte length: ${byteLength}`);
  }

  const dataView = new DataView(new ArrayBuffer(byteLength));
  const byteOffset = 0;
  const littleEndian = false; // big-endian keeps the sign bit first

  if (byteLength === singlePrecisionBytesLength) {
    dataView.setFloat32(byteOffset, floatNumber, littleEndian);
  } else {
    dataView.setFloat64(byteOffset, floatNumber, littleEndian);
  }

  let numberAsBinaryString = "";
  for (let byteIndex = 0; byteIndex < byteLength; byteIndex += 1) {
    const bits = dataView.getUint8(byteIndex).toString(2);
    // toString(2) drops leading zeros; restore them to a full byte.
    numberAsBinaryString += "0".repeat(bitsInByte - bits.length) + bits;
  }
  return numberAsBinaryString;
}
import java.nio.ByteBuffer
/**
 * Returns the IEEE 754 bit pattern of [floatNumber] as a string of '0' and '1'
 * characters, most significant bit first (sign, exponent, fraction).
 *
 * @param floatNumber the value to encode
 * @param byteLength 4 for the 32-bit single-precision encoding, or 8 for the
 *   64-bit double-precision encoding of the value widened to [Double]
 * @return a string of exactly `byteLength * 8` binary digits
 * @throws IllegalArgumentException if [byteLength] is neither 4 nor 8
 */
fun floatAsBinaryString(floatNumber: Float, byteLength: Int): String {
    val buffer = when (byteLength) {
        4 -> ByteBuffer.allocate(4).putFloat(floatNumber)
        // putFloat would fill only half of an 8-byte buffer; write a real
        // double so all 64 bits carry the encoding.
        8 -> ByteBuffer.allocate(8).putDouble(floatNumber.toDouble())
        else -> throw IllegalArgumentException("Unsupported byte length: $byteLength")
    }
    return buffer.array().joinToString(separator = "") { byte ->
        // Mask to 0..255 so negative bytes do not sign-extend to 32 digits.
        (byte.toInt() and 0xFF).toString(radix = 2).padStart(8, '0')
    }
}
import struct
def float_as_binary_string(float_number, byte_length):
    """Return the bit pattern of ``float_number`` as a string of 0s and 1s.

    ``byte_length`` of 4 gives the 32-digit pattern of the value packed as a
    C ``float``; 8 gives the 64-digit pattern of the value packed as a C
    ``double``. Any other ``byte_length`` yields an empty string.
    """
    # (size in bytes, struct code for the float, struct code for the
    #  same-sized unsigned integer, zero-padded binary format spec)
    layouts = (
        (4, '<f', '<I', '032b'),
        (8, '<d', '<Q', '064b'),
    )
    for size, float_code, uint_code, bit_format in layouts:
        if byte_length == size:
            # Pack as a float, then reread the same bytes as an integer so
            # the raw bits can be formatted in base 2.
            raw = struct.pack(float_code, float_number)
            (bit_pattern,) = struct.unpack(uint_code, raw)
            return format(bit_pattern, bit_format)
    return ""
use std::mem;
/// Returns the IEEE 754 bit pattern of `float_number` as a string of `0` and
/// `1` characters, most significant bit first (sign, exponent, fraction).
///
/// `byte_length` selects the format: `4` converts the value to `f32` and
/// yields 32 digits; `8` keeps it as `f64` and yields 64 digits.
///
/// # Panics
///
/// Panics with "Unsupported byte length" if `byte_length` is neither 4 nor 8.
fn float_as_binary_string(float_number: f64, byte_length: usize) -> String {
    match byte_length {
        // Formatting the raw bits as one integer gives the digits in
        // sign-first order and exactly the width of the chosen format.
        4 => format!("{:032b}", (float_number as f32).to_bits()),
        8 => format!("{:064b}", float_number.to_bits()),
        _ => panic!("Unsupported byte length"),
    }
}
/**
 * Renders the bytes of a number, written big-endian as a 32-bit or 64-bit
 * float, as a string of "0" and "1" characters.
 *
 * @param floatNumber - The value to encode.
 * @param byteLength - 4 to write a 32-bit float, 8 to write a 64-bit float.
 * @returns A string with eight binary digits per byte.
 * @throws Error with message "Unsupported byte length" for any other size.
 */
function floatAsBinaryString(
  floatNumber: number,
  byteLength: number = 4,
): string {
  // Allocate before validating, so an invalid buffer size surfaces first.
  const view: DataView = new DataView(new ArrayBuffer(byteLength));

  switch (byteLength) {
    case 4:
      view.setFloat32(0, floatNumber, false);
      break;
    case 8:
      view.setFloat64(0, floatNumber, false);
      break;
    default:
      throw new Error("Unsupported byte length");
  }

  const digitGroups: string[] = [];
  for (let offset = 0; offset < byteLength; offset += 1) {
    // Setting bit 8 forces a nine-digit result; dropping the leading "1"
    // leaves the byte zero-padded to eight digits.
    digitGroups.push((view.getUint8(offset) | 0x100).toString(2).slice(1));
  }
  return digitGroups.join("");
}