libnds Check if faster floating point functions using existing hardware capabilities are possible

Feature Request

What feature are you suggesting?

Overview:

https://datasheets.raspberrypi.com/pico/raspberry-pi-pico-c-sdk.pdf

This seems to suggest that gcc's soft float emulation implementation is not very fast, and could be 50 -100 % faster.

Smaller Details:

I have written some small implementations that could be used for this purpose. What still needs to be done is to benchmark them and look for possible optimizations.

for example

f32 fastfloat(s32 x){
    //convert 20.12 fixed point number to float without using fmul
    union{
        f32 fnum;
        u32 original;
    } data; //union because pointer casting isnt compiler sanctioned
    data.original=x;
    if (x==0) return data.fnum;
    u32 f=-(x<0)*x+(x>0)*x; //absolute value
    s32 clz =__builtin_clz(f); //make sure this is the appropriate clz for your system; should return 31 for the input 1
    //clz is being used as a standin for log2 here
    s32 shift= clz-8;
    //adjust this if you work with a different float type. The value 8 fromes from 31-23, so for 64 bit it'd be 63-52
    u32  mantissa= (shift>=0 ?  f<<  shift : f>>-shift)  -( 1<<23 ) ;  
    //adjust this if you work with a different float type. 23 is the number of explicitly stored bits in the mantissa.
    //can also replace the subtraction with a bitwise  & ((1<<24)-1)    
    s32 exponent=138-shift  ; 
    //adjust this if you work with a different fixed point type. 138 is 150-12 , and 150 is 127+23, so for a double it'd probably be 1023+52-12 
    exponent=exponent<<23;  //adjust this for a different float type
    u32 sign= x & (1<<31); //adjust this for a different float type
    data.original= sign |exponent |mantissa;
    return data.fnum;
}

f32 fsqrt(f32 x){
    union{f32 f; u32 i;}xu;
    xu.f=x;
    //grab exponent
    s32 exponent= (xu.i & (0xff<<23));
    if(exponent==0)return 0.0;
    exponent=exponent-(127<<23);

    exponent=exponent>>1; //right shift on negative number depends on compiler
    u64 mantissa=xu.i & ((1<<23)-1);
    mantissa=(mantissa+(1<<23))<<23;
    if ((exponent & (1<<22))>0){
    mantissa=mantissa<<1;
    }
    u32 new_mantissa= (u32) sqrt(mantissa); //modify this line to use the NDS 64 bit hardware sqrt 
    xu.i= ((exponent+(127<<23))& (0xff<<23) ) | (new_mantissa & ((1<<23)-1));
    return xu.f;
}

Could be used for sqrtf(), and it would mostly likely be orders of magnitude faster than the implementation in <math.h>

In projects where an error of at most 1 part in a hundred thousand is allowable (smaller 2**-16, most of the time it is around 1-3 parts in 1 million) , fmuls could for example be done like this. This compiles to about 31 instructions in arm mode, and the assembly could potentially be hand optimized (for example replacing the mul with an SMULWxy) .

This can be made more accurate by using a long mul u32 mantissa=i+j+(((i>>7)*(j>>7))>>9) for the multiplication of i and j and downshifting by >>23 but it costs some speed and requires more registers, i.e. doing ((i*j)>>23)

f32 fmul( f32 x, f32 y){
    union {
	u32 i;
	f32 f;
    }xu;
    xu.f=x;
    union {
	u32 j;
	f32 f;
    }yu;
    yu.f=y;
    u32 a =xu.i & (1<<31);//get sign of x
    u32 b =yu.j & (1<<31);//get sign of y
    u32 exponentx=(xu.i) & (0xff<<23);
    u32 exponenty=(yu.j) & (0xff<<23);
    s32 combined_exponent =(exponentx -(127 <<23))+ exponenty ;
    if (combined_exponent<= (1<<23) ){
	return 0.0; //this deletes sign information but I dont care
   }
    u32 i=(xu.i & 0x7fffff); //should be explicit bits of mantissa
    u32 j=(yu.j & 0x7fffff);
    u32 mantissa=i+j+(((i>>7)*(j>>7))>>9);
    mantissa =mantissa +  (1<< 23) ;
    s32 clz=__builtin_clz(mantissa);
    s32 shift=8-clz;
    mantissa= shift>=0 ? mantissa>>shift: mantissa<<-shift;
    mantissa= mantissa & ((1<<23)-1);
    u32 exponent=(combined_exponent+(shift<<23 ));
    u32 sign= a^b;
    xu.i= sign |exponent | mantissa;
    return xu.f;
}

In addition, a combination fast exp() and log functions could be used to replace pow() functions when the error on both is small enough. Cawley, G. C. (2000). On a Fast, Compact Approximation of the Exponential Function. Neural Computation has a floating point exp function approximation that only uses a 4-5 integer additions and which doesnt require an FPU.

Vinyals, O., & Friedland, G. (2008). A Hardware-Independent Fast Logarithm Approximation with Adjustable Accuracy. 2008 Tenth IEEE International Symposium on Multimedia seems to have a log implementation that could be adapted.

Nature of Request:

Addition

Why would this feature be useful?

It makes using floating point functions faster, mostly useful for porting existing code more quickly.

Projects that are written from scratch should not use these functions, as they would still be slower than fixed point. An alternative could be libraries that use fixed point with a syntax that makes it possible to use them as if they were floating point numbers, for example https://github.com/MikeLankamp/fpm .

Mar 15 '24 16:03 Kuratius

f32 fmull( f32 x, f32 y){
    union {
    u32 i;
    f32 f;
    }xu;
    xu.f=x;
    union {
    u32 j;
    f32 f;

    }yu;
    yu.f=y;

    u32 a =xu.i & (1<<31);//get sign of x
    u32 b =yu.j & (1<<31);//get sign of y
    
    s32 exponentx=(xu.i ) &( 0xff<<23);
    s32 exponenty=(yu.j) &(0xff<<23);
    if(exponentx==0 || exponenty==0) return 0.0;
    s32 combined_exponent =(exponentx- (127<<23)) +exponenty;

    u32 i=(xu.i & 0x7fffff); //should be explicit bits of mantissa
    u32 j=(yu.j & 0x7fffff);
    //i=i<<4;
    //j=j<<5;
    u32 mantissa=i+j+(((u64)(i<<4)*(u64)(j<<5))>>32);
    mantissa =mantissa +  (1<< 23) ;
    s32 clz=__builtin_clz(mantissa);

    s32 shift=8-clz;

    mantissa= shift>=0 ? mantissa>>shift: mantissa<<-shift;

    mantissa= mantissa & ((1<<23)-1);
    u32 exponent=(combined_exponent+(shift<<23));
    u32 sign= a^b;
    union{ 
    f32 f;
    u32 k;
    }result;
    result.k= sign |exponent | mantissa;

    return combined_exponent<=(0<<23)? 0.0:result.f ;

}

This should have full accuracy for the fmul and only uses ~28 instructions.

This is achieved by the compiler spotting that it can just ignore the lower register of the umull instruction result and only use the Hi register, removing the >>32 shift. By adjusting the <<4 shift to be a <<5 shift, proper rounding can be added to this as well, though I wouldn't consider it particularly necessary.

Floating point division and floating point square roots that use the hardware functions asynchronously are also possible. For example


f32 myfdiv(f32 x, f32 y ) {
    union {
    u32 i;
    f32 f;
    }xu;
    xu.f=x;
    union {
    u32 j;
    f32 f;

    }yu;
    yu.f=y;
    u64 i=((xu.i & ((1<<23)-1))|(1<<23));    
    u32 j=((yu.j & ((1<<23)-1))|(1<<23));
    REG_DIVCNT = DIV_64_32; 
    REG_DIV_NUMER = (i<<23);
    REG_DIV_DENOM_L = j;
    s16 shift= i<j ? -1 : 0; 
    u64 m=(xu.i) & (0xff<<23);
    u64 n=(yu.j) & (0xff<<23);
    s32 exponent=m-n+(shift<<23)+(127<<23);
    u32 a =xu.i & (1<<31);//get sign of x
    u32 b =yu.j & (1<<31);//get sign of y
    u32 sign=a^b;
    if ((exponent<=(1<<23))){
        while(REG_DIVCNT & DIV_BUSY);
        return 0.0;
    }
    while(REG_DIVCNT & DIV_BUSY);
    u32 mantissa=REG_DIV_RESULT_L;
    mantissa=shift>=0 ? mantissa>>(shift) : mantissa<<-shift;
    mantissa=mantissa & ((1<<23)-1);
    xu.i= sign | exponent | mantissa;
    return xu.f;
}

f32 myfsqrt(f32 x){
    union{f32 f; u32 i;}xu;
    xu.f=x;
    u64 mantissa=xu.i & ((1<<23)-1);
    //check if exponent is odd
    //before subtracting 127 exponent was even if it is odd now
    //therefore check if last digit is 0 
    mantissa=(mantissa+(1<<23))<<   ( (( xu.i & (1<<23))==0  )      +23 );
    REG_SQRTCNT = SQRT_64;
    REG_SQRT_PARAM = mantissa;
    u32 raw_exponent= (xu.i & (0xff<<23));
    if (raw_exponent > 0 ) {
        s32 exponent=raw_exponent-(127<<23);
        exponent=exponent>>1; //right shift on negative number depends on compiler
        exponent=((exponent+(127<<23))& (0xff<<23) );
    //fetch async result here
    
        while(REG_SQRTCNT & SQRT_BUSY);
        xu.i=  exponent| (REG_SQRT_RESULT & ((1<<23)-1));
        return xu.f;
    } else{
        while(REG_SQRTCNT & SQRT_BUSY);
        return 0.0;
    };

}

Mar 19 '24 13:03 Kuratius

The rpi pico is in a similar situation, for their case they just added a compiler switch that lets users choose between the gcc float implementation and handpicked assembly routines, so this may be desirable here as well. In addition, it may be desirable for functions like this to reside in the fastest memory possible.

Mar 19 '24 13:03 Kuratius