F# Performance Tweaking

Article
09/16/2008

Jomo Fisher—I’ve enjoyed reading the blog series on F# and game development here. Joh had posted an example of F# of using RK4 and showed some performance numbers comparing F# to C++ and OCaml. Don Syme had been reading too and asked if anyone on the F# team wanted to take a stab to see if they could optimize Joh’s code any further. I thought it would be fun. It turns out there were a few small changes that make a significant performance improvement without sacrificing immutability or other nice functionalness.

The example code in this article was compiled with the 1.9.6.2 version of the F# compiler.

Joh probably already accounted for this, but it’s worth pointing out that there is a huge performance difference between Debug and Release builds for this example. On my machine I could measure a 3x performance difference between the two.

Considering just Release builds, I was able to get another 2-3x improvement by making several simple changes:

- Change a few functions to return struct instead of tuple. For this, I created a ps (e.g. Position-Speed) struct and a sa (e.g. Speed-Acceleration) struct.

- I manually lifted a few computations out of the outer similar loop. I made (euler_implicit_opt accel_opt) and (runge_kutta_4_opt (adapt_opt accel_opt))into values.

- I changed the simulate function to use OptimizedClosures.FastFunc3.

This last part especially needs some explaining. Closure optimization can be used when a function will be called many times. It allows F# to do a bit of expensive work up front that will later be amortized across the many calls to the function. Below is the simulate function using optimized closures. I’ve bolded the affected lines.

let inline simulate_opt intg_func t0 t_fin delta pos0 speed0 =

let oc = OptimizedClosures.FastFunc3<_,_,_,_>.Adapt(intg_func)

let rec repeat t0 pos0 speed0 =

if t0 > t_fin then (pos0, speed0)

else

let t1 = t0 + delta

let ps:ps<_> = oc.Invoke(pos0,speed0,delta)

repeat t1 ps.pos ps.speed

repeat t0 pos0 speed0

Unfortunately, this is not something that you can stick everywhere and just get a benefit (otherwise, the compiler could just inject it for you). You should definitely profile before and after. Here are my final performance numbers:

Optimized---

Euler single: 2.529253

Euler multi: 1.328133

RK4 single: 2.383238

RK4 multi: 1.241124

Baseline---

Euler single: 7.672767

Euler multi: 3.922392

RK4 single: 4.563456

RK4 multi: 2.450245

Here’s the entire code. The optimized functions end in _opt. Thanks Joh for an awesome diversion!

#light

open System

let run_time_func (func: unit -> 'a) =

let time0 = System.DateTime.Now

let res = func ()

let diff0 = System.DateTime.Now - time0

(diff0, res)

let map_parallel func items =

let tasks =

seq {

for i in items -> async {

return (func i)

}

Async.Run (Async.Parallel tasks)

let gravity =

-9.81

let inline spring pos =

let k = 100.0 in

-k * pos

let inline drag k pos speed = -k * speed

type ps<'num> = struct

val pos:'num

val speed:'num

new(pos,speed) = {pos=pos;speed=speed}

end

type sa<'num> = struct

val speed:'num

val accel:'num

new(speed,accel) = {speed=speed;accel=accel}

end

let inline euler_implicit accel_func pos speed delta =

let speed2 = speed + delta * accel_func pos speed in

pos + delta * speed2, speed2

let inline euler_implicit_opt accel_func pos speed delta =

let speed2 = speed + delta * accel_func pos speed in

ps(pos + delta * speed2, speed2)

let inline adapt (f: 'num -> 'num -> 'num) =

fun pos speed ->

let accel = f pos speed in

speed, accel

let inline adapt_opt (f: 'num -> 'num -> 'num) =

fun pos speed ->

let accel = f pos speed in

sa(speed, accel)

let inline runge_kutta_4 deriv_func pos speed delta =

let half_delta = 0.5 * delta

let s1, a1 = deriv_func pos speed

let s2, a2 = deriv_func (pos + half_delta * s1) (speed + half_delta * a1)

let s3, a3 = deriv_func (pos + half_delta * s2) (speed + half_delta * a2)

let s4, a4 = deriv_func (pos + delta * s3) (speed + delta * a3)

let pos1 = pos + delta/6.0*(s1 + 2.0*s2 + 2.0*s3 + s4)

let speed1 = speed + delta/6.0*(a1 + 2.0*a2 + 2.0*a3 + a4) in

pos1, speed1

let inline runge_kutta_4_opt deriv_func pos speed delta =

let half_delta = 0.5 * delta

let sa1:sa<_> = deriv_func pos speed

let sa2 = deriv_func (pos + half_delta * sa1.speed) (speed + half_delta * sa1.accel)

let sa3 = deriv_func (pos + half_delta * sa2.speed) (speed + half_delta * sa2.accel)

let sa4 = deriv_func (pos + delta * sa3.speed) (speed + delta * sa3.speed)

let pos1 = pos + delta/6.0*(sa1.speed + 2.0*sa2.speed + 2.0*sa3.speed + sa4.speed)

let speed1 = speed + delta/6.0*(sa1.accel + 2.0*sa2.accel + 2.0*sa3.accel + sa4.accel) in

ps(pos1, speed1)

let inline simulate intg_func t0 t_fin delta pos0 speed0 =

let rec repeat t0 pos0 speed0 =

if t0 > t_fin then (pos0, speed0)

else

let t1 = t0 + delta

let pos1, speed1 = intg_func pos0 speed0 delta

repeat t1 pos1 speed1

repeat t0 pos0 speed0

let inline simulate_opt intg_func t0 t_fin delta pos0 speed0 =

let oc = OptimizedClosures.FastFunc3<_,_,_,_>.Adapt(intg_func)

let rec repeat t0 pos0 speed0 =

if t0 > t_fin then (pos0, speed0)

else

let t1 = t0 + delta

let ps:ps<_> = oc.Invoke(pos0,speed0,delta)

repeat t1 ps.pos ps.speed

repeat t0 pos0 speed0

let initial_states = [ for s in 1..1000 -> (float s * 1.0, 0.0) ]

let t0 = 0.0

let t_fin = 1000.0

let delta = 0.025

let accel pos speed = (drag 1.0) pos speed + spring pos + gravity in

let accel_opt pos speed = (drag 1.0) pos speed + spring pos + gravity in

let ef = (euler_implicit_opt accel_opt)

let rk4f = (runge_kutta_4_opt (adapt_opt accel_opt))

// Changes

// - Change euler_implicit_opt and runge_kutta_4_opt to return a struct (ps)

// - Manually lift (euler_implicit_opt accel) and (runge_kutta_4_opt (adapt accel)) out of loop

// - Change adapt_opt to return struct (sa)

// - Use optimized closure in simulate_opt

printfn "Optimized---"

for i in 0..0 do

let (run_time, res) = run_time_func (fun () -> initial_states |> List.map(fun (x,y) -> simulate_opt ef t0 t_fin (0.25*delta) x y))

printfn "Euler single: %f" run_time.TotalSeconds;

let (run_time, res) = run_time_func (fun () -> initial_states |> map_parallel(fun (x,y) -> simulate_opt ef t0 t_fin (0.25*delta) x y))

printfn "Euler multi: %f" run_time.TotalSeconds;

let (run_time, res) = run_time_func (fun () -> initial_states |> List.map(fun (x,y) -> simulate_opt rk4f t0 t_fin delta x y))

printfn "RK4 single: %f" run_time.TotalSeconds;

let (run_time, res) = run_time_func (fun () -> initial_states |> map_parallel(fun (x,y) -> simulate_opt rk4f t0 t_fin delta x y))

printfn "RK4 multi: %f" run_time.TotalSeconds;

printfn "Baseline---"

for i in 0..0 do

let (run_time, res) = run_time_func (fun () -> initial_states |> List.map(fun (x,y) -> simulate (euler_implicit accel) t0 t_fin (0.25*delta) x y))

printfn "Euler single: %f" run_time.TotalSeconds;

let (run_time, res) = run_time_func (fun () -> initial_states |> map_parallel(fun (x,y) -> simulate (euler_implicit accel) t0 t_fin (0.25*delta) x y))

printfn "Euler multi: %f" run_time.TotalSeconds;

let (run_time, res) = run_time_func (fun () -> initial_states |> List.map(fun (x,y) -> simulate (runge_kutta_4 (adapt accel)) t0 t_fin delta x y))

printfn "RK4 single: %f" run_time.TotalSeconds;

let (run_time, res) = run_time_func (fun () -> initial_states |> map_parallel(fun (x,y) -> simulate (runge_kutta_4 (adapt accel)) t0 t_fin delta x y))

printfn "RK4 multi: %f" run_time.TotalSeconds;

done;

T his posting is provided "AS IS" with no warranties, and confers no rights.

F# Performance Tweaking

Additional resources