diff --git a/examples/3D_mibm_sphere_head_on_collision/case.py b/examples/3D_mibm_sphere_head_on_collision/case.py
index 981aacb310..ec4b908ecf 100644
--- a/examples/3D_mibm_sphere_head_on_collision/case.py
+++ b/examples/3D_mibm_sphere_head_on_collision/case.py
@@ -89,6 +89,7 @@
             "prim_vars_wrt": "T",
             "E_wrt": "T",
             "parallel_io": "T",
+            "ib_state_wrt": "T",
             # Patch: Constant Tube filled with air
             # Specify the cylindrical air tube grid geometry
             "patch_icpp(1)%geometry": 9,
diff --git a/src/simulation/m_collisions.fpp b/src/simulation/m_collisions.fpp
index ac0ab630e4..85b42880eb 100644
--- a/src/simulation/m_collisions.fpp
+++ b/src/simulation/m_collisions.fpp
@@ -48,12 +48,12 @@ contains
 
     end subroutine s_initialize_collisions_module
 
-    subroutine s_apply_collision_forces(ghost_points, num_gps, ib_markers, forces, torques)
+    subroutine s_apply_collision_forces(ghost_points, num_gps, ib_markers, ib_ft)
 
         type(ghost_point), dimension(:), intent(in)    :: ghost_points
         integer, intent(in)                            :: num_gps
         type(integer_field), intent(in)                :: ib_markers
-        real(wp), dimension(num_ibs, 3), intent(inout) :: forces, torques
+        real(wp), dimension(6, num_ibs), intent(inout) :: ib_ft
         integer                                        :: num_considered_collisions
 
         ! return if no collisions
@@ -67,17 +67,17 @@ contains
 
         select case (collision_model)
         case (1)  ! soft sphere model
-            call s_apply_wall_collision_forces_soft_sphere(forces, torques)
-            call s_apply_ib_collision_forces_soft_sphere(num_considered_collisions, forces, torques)
+            call s_apply_wall_collision_forces_soft_sphere(ib_ft)
+            call s_apply_ib_collision_forces_soft_sphere(num_considered_collisions, ib_ft)
         end select
 
     end subroutine s_apply_collision_forces
 
     !> @brief applies collision forces to IBs assuming a soft-sphere collision model (all IBs are circles or spheres)
-    subroutine s_apply_ib_collision_forces_soft_sphere(num_considered_collisions, forces, torques)
+    subroutine s_apply_ib_collision_forces_soft_sphere(num_considered_collisions, ib_ft)
 
         integer, intent(in) :: num_considered_collisions
-        real(wp), dimension(num_ibs, 3), intent(inout) :: forces, torques
+        real(wp), dimension(6, num_ibs), intent(inout) :: ib_ft
         integer :: i, encoded_pid1, encoded_pid2, xp1, xp2, yp1, yp2, zp1, zp2, pid1, pid2, l  ! iterators and patch IDs
         real(wp) :: overlap_distance
         real(wp), dimension(3) :: normal_vector, centroid_1, centroid_2
@@ -93,7 +93,7 @@ contains
         $:GPU_PARALLEL_LOOP(private='[i, l, encoded_pid1, encoded_pid2, xp1, xp2, yp1, yp2, zp1, zp2, pid1, pid2, centroid_1, &
                             & centroid_2, normal_vector, overlap_distance, effective_mass, k, eta, normal_velocity, &
                             & tangental_vector, normal_force, tangental_force, torque, radial_vector, rotation_velocity, vel1, &
-                            & vel2]', copy='[forces, torques]')
+                            & vel2]', copy='[ib_ft]')
         do i = 1, num_considered_collisions
             encoded_pid1 = collision_lookup(i, 3)
             encoded_pid2 = collision_lookup(i, 4)
@@ -144,15 +144,15 @@ contains
                     do l = 1, num_dims
                         ! update the first IB
                         $:GPU_ATOMIC(atomic='update')
-                        forces(pid1, l) = forces(pid1, l) + (normal_force(l) + tangental_force(l))
+                        ib_ft(l, pid1) = ib_ft(l, pid1) + (normal_force(l) + tangental_force(l))
                         $:GPU_ATOMIC(atomic='update')
-                        torques(pid1, l) = torques(pid1, l) + torque(l)
+                        ib_ft(l + 3, pid1) = ib_ft(l + 3, pid1) + torque(l)
 
                         ! apply equal and opposite force/torque to second IB
                         $:GPU_ATOMIC(atomic='update')
-                        forces(pid2, l) = forces(pid2, l) - (normal_force(l) + tangental_force(l))
+                        ib_ft(l, pid2) = ib_ft(l, pid2) - (normal_force(l) + tangental_force(l))
                         $:GPU_ATOMIC(atomic='update')
-                        torques(pid2, l) = torques(pid2, l) + torque(l)*patch_ib(pid2)%radius/patch_ib(pid1)%radius
+                        ib_ft(l + 3, pid2) = ib_ft(l + 3, pid2) + torque(l)*patch_ib(pid2)%radius/patch_ib(pid1)%radius
                     end do
                 end if
             end if
@@ -162,9 +162,9 @@ contains
     end subroutine s_apply_ib_collision_forces_soft_sphere
 
     !> @brief applies collision forces to IBs assuming a soft-sphere collision model (all IBs are circles or spheres)
-    subroutine s_apply_wall_collision_forces_soft_sphere(forces, torques)
+    subroutine s_apply_wall_collision_forces_soft_sphere(ib_ft)
 
-        real(wp), dimension(num_ibs, 3), intent(inout) :: forces, torques
+        real(wp), dimension(6, num_ibs), intent(inout) :: ib_ft
         integer :: patch_id, i, l
         real(wp), dimension(3) :: normal_force, tangental_force, normal_vector, normal_velocity, tangental_vector, &
              & collision_location, torque, radial_vector, rotation_velocity, velocity
@@ -172,7 +172,7 @@ contains
 
         $:GPU_PARALLEL_LOOP(private='[patch_id, i, l, collision_location, normal_vector, k, eta, normal_velocity, &
                             & tangental_vector, normal_force, tangental_force, torque, radial_vector, rotation_velocity, &
-                            & velocity]', copy='[forces, torques]', collapse=2)
+                            & velocity]', copy='[ib_ft]', collapse=2)
         do patch_id = 1, num_ibs
             do i = 1, num_dims*2
                 ! only compute force contributions if there was an overlap
@@ -217,9 +217,9 @@ contains
 
                     do l = 1, num_dims
                         $:GPU_ATOMIC(atomic='update')
-                        forces(patch_id, l) = forces(patch_id, l) + (normal_force(l) + tangental_force(l))
+                        ib_ft(l, patch_id) = ib_ft(l, patch_id) + (normal_force(l) + tangental_force(l))
                         $:GPU_ATOMIC(atomic='update')
-                        torques(patch_id, l) = torques(patch_id, l) + torque(l)
+                        ib_ft(l + 3, patch_id) = ib_ft(l + 3, patch_id) + torque(l)
                     end do
                 end if
             end do
diff --git a/src/simulation/m_ibm.fpp b/src/simulation/m_ibm.fpp
index 46dbe2d6eb..2332ae91d5 100644
--- a/src/simulation/m_ibm.fpp
+++ b/src/simulation/m_ibm.fpp
@@ -47,8 +47,7 @@ module m_ibm
 
     ! IB MPI buffers
     integer, allocatable  :: send_ids(:), recv_ids(:)
-    real(wp), allocatable :: send_ft(:,:), recv_ft(:,:)
-    real(wp), allocatable :: recv_forces_snap(:,:), recv_torques_snap(:,:)
+    real(wp), allocatable :: recv_ft(:,:), recv_ft_snap(:,:)
 
 contains
 
@@ -98,9 +97,8 @@ contains
         ! allocate some arrays for MPI communication, if required by this simulation
 #ifdef MFC_MPI
         if (num_procs > 1) then
-            @:ALLOCATE(send_ids(size(patch_ib)), send_ft(6, size(patch_ib)))
-            allocate (recv_forces_snap(size(patch_ib), 3), recv_torques_snap(size(patch_ib), 3), recv_ids(size(patch_ib)), &
-                      & recv_ft(6, size(patch_ib)))
+            @:ALLOCATE(send_ids(size(patch_ib)))
+            allocate (recv_ft_snap(6, size(patch_ib)), recv_ids(size(patch_ib)), recv_ft(6, size(patch_ib)))
         end if
 #endif
 
@@ -908,7 +906,7 @@ contains
         type(scalar_field), dimension(1:sys_size), intent(in)          :: q_prim_vf
         type(physical_parameters), dimension(1:num_fluids), intent(in) :: fluid_pp
         integer                                                        :: i, j, k, l, encoded_ib_idx, ib_idx, ib_idx_temp, fluid_idx
-        real(wp), dimension(num_ibs, 3)                                :: forces, torques
+        real(wp), dimension(6, num_ibs)                                :: ib_ft
         ! viscous stress tensor with temp vectors to hold divergence calculations
         real(wp), dimension(1:3,1:3) :: viscous_stress_div, viscous_stress_div_1, viscous_stress_div_2
         real(wp), dimension(1:3)     :: local_force_contribution, radial_vector, local_torque_contribution
@@ -922,8 +920,7 @@ contains
 
         call nvtxStartRange("COMPUTE-IB-FORCES")
 
-        forces = 0._wp
-        torques = 0._wp
+        ib_ft = 0._wp
 
         if (viscous) then
             do fluid_idx = 1, num_fluids
@@ -937,8 +934,8 @@ contains
 
         $:GPU_PARALLEL_LOOP(private='[i, j, k, l, ib_idx, ib_idx_temp, encoded_ib_idx, fluid_idx, radial_vector, &
                             & local_force_contribution, cell_volume, local_torque_contribution, dynamic_viscosity, &
-                            & viscous_stress_div, viscous_stress_div_1, viscous_stress_div_2, dx, dy, dz]', copy='[forces, &
-                            & torques]', copyin='[dynamic_viscosities]', collapse=3)
+                            & viscous_stress_div, viscous_stress_div_1, viscous_stress_div_2, dx, dy, dz]', copy='[ib_ft]', &
+                            & copyin='[dynamic_viscosities]', collapse=3)
         do i = 0, m
             do j = 0, n
                 do k = 0, p
@@ -1025,9 +1022,9 @@ contains
                             ! Update the force and torque values atomically to prevent race conditions
                             do l = 1, 3
                                 $:GPU_ATOMIC(atomic='update')
-                                forces(ib_idx, l) = forces(ib_idx, l) + (local_force_contribution(l)*cell_volume)
+                                ib_ft(l, ib_idx) = ib_ft(l, ib_idx) + (local_force_contribution(l)*cell_volume)
                                 $:GPU_ATOMIC(atomic='update')
-                                torques(ib_idx, l) = torques(ib_idx, l) + local_torque_contribution(l)*cell_volume
+                                ib_ft(l + 3, ib_idx) = ib_ft(l + 3, ib_idx) + local_torque_contribution(l)*cell_volume
                             end do
                         end if  ! ib_idx > 0
                     end if
@@ -1036,29 +1033,29 @@ contains
         end do
         $:END_GPU_PARALLEL_LOOP()
 
-        call s_apply_collision_forces(ghost_points, num_gps, ib_markers, forces, torques)
+        call s_apply_collision_forces(ghost_points, num_gps, ib_markers, ib_ft)
 
         ! reduce the forces across local neighborhood ranks
-        call s_communicate_ib_forces(forces, torques)
+        call s_communicate_ib_forces(ib_ft)
 
         ! consider body forces after reducing to avoid double counting
         do i = 1, num_ibs
             if (bf_x) then
-                forces(i, 1) = forces(i, 1) + accel_bf(1)*patch_ib(i)%mass
+                ib_ft(1, i) = ib_ft(1, i) + accel_bf(1)*patch_ib(i)%mass
             end if
             if (bf_y) then
-                forces(i, 2) = forces(i, 2) + accel_bf(2)*patch_ib(i)%mass
+                ib_ft(2, i) = ib_ft(2, i) + accel_bf(2)*patch_ib(i)%mass
             end if
             if (bf_z) then
-                forces(i, 3) = forces(i, 3) + accel_bf(3)*patch_ib(i)%mass
+                ib_ft(3, i) = ib_ft(3, i) + accel_bf(3)*patch_ib(i)%mass
             end if
         end do
 
         ! apply the summed forces
-        $:GPU_PARALLEL_LOOP(private='[i]', copyin='[forces, torques]')
+        $:GPU_PARALLEL_LOOP(private='[i]', copyin='[ib_ft]')
         do i = 1, num_ibs
-            patch_ib(i)%force(:) = forces(i,:)
-            patch_ib(i)%torque(:) = torques(i,:)
+            patch_ib(i)%force(:) = ib_ft(1:3,i)
+            patch_ib(i)%torque(:) = ib_ft(4:6,i)
         end do
         $:END_GPU_PARALLEL_LOOP()
 
@@ -1247,9 +1244,9 @@ contains
     !! 2: send current (post-pass-1) values; add received; subtract recv_snap to remove double-counting of the direct contribution
     !! already added in pass 1. Back-propagation phase: 2 passes per dimension receiving from the high-index (+X) neighbor, each
     !! overwriting local forces with the neighbor's accumulated total.
-    subroutine s_communicate_ib_forces(forces, torques)
+    subroutine s_communicate_ib_forces(ib_ft)
 
-        real(wp), dimension(num_ibs, 3), intent(inout) :: forces, torques
+        real(wp), dimension(6, num_ibs), intent(inout) :: ib_ft
 
 #ifdef MFC_MPI
         integer                       :: i, j, k, pack_pos, unpack_pos, buf_size, ierr
@@ -1267,24 +1264,21 @@ contains
                 send_neighbor = merge(bc_${X}$%end, MPI_PROC_NULL, bc_${X}$%end >= 0)
                 recv_neighbor = merge(bc_${X}$%beg, MPI_PROC_NULL, bc_${X}$%beg >= 0)
 
-                recv_forces_snap = 0._wp
-                recv_torques_snap = 0._wp
+                recv_ft_snap = 0._wp
                 tag = 300
 
                 do k = 1, 2*ib_neighborhood_radius
                     ! send forces to +${X}$ neighbor; receive from -${X}$ neighbor. Add received values then
                     pack_pos = 0
-                    $:GPU_PARALLEL_LOOP(private='[i]', copyin='[forces, torques]')
+                    $:GPU_PARALLEL_LOOP(private='[i]')
                     do i = 1, num_ibs
                         send_ids(i) = patch_ib(i)%gbl_patch_id
-                        send_ft(1:3,i) = forces(i,:)
-                        send_ft(4:6,i) = torques(i,:)
                     end do
                     $:END_GPU_PARALLEL_LOOP()
-                    $:GPU_UPDATE(host='[send_ids, send_ft]')
+                    $:GPU_UPDATE(host='[send_ids]')
                     call MPI_PACK(num_ibs, 1, MPI_INTEGER, ib_force_send_buf, buf_size, pack_pos, MPI_COMM_WORLD, ierr)
                     call MPI_PACK(send_ids, num_ibs, MPI_INTEGER, ib_force_send_buf, buf_size, pack_pos, MPI_COMM_WORLD, ierr)
-                    call MPI_PACK(send_ft, 6*num_ibs, mpi_p, ib_force_send_buf, buf_size, pack_pos, MPI_COMM_WORLD, ierr)
+                    call MPI_PACK(ib_ft, 6*num_ibs, mpi_p, ib_force_send_buf, buf_size, pack_pos, MPI_COMM_WORLD, ierr)
                     call MPI_SENDRECV(ib_force_send_buf, pack_pos, MPI_PACKED, send_neighbor, tag, ib_force_recv_buf, buf_size, &
                                       & MPI_PACKED, recv_neighbor, tag, MPI_COMM_WORLD, MPI_STATUS_IGNORE, ierr)
 
@@ -1294,16 +1288,13 @@ contains
                         call MPI_UNPACK(ib_force_recv_buf, buf_size, unpack_pos, recv_ids, recv_count, MPI_INTEGER, &
                                         & MPI_COMM_WORLD, ierr)
                         call MPI_UNPACK(ib_force_recv_buf, buf_size, unpack_pos, recv_ft, 6*recv_count, mpi_p, MPI_COMM_WORLD, ierr)
-                        $:GPU_PARALLEL_LOOP(private='[i, j]', copyin='[recv_ft, recv_ids]', copy='[forces, torques, &
-                                            & recv_forces_snap, recv_torques_snap]')
+                        $:GPU_PARALLEL_LOOP(private='[i, j]', copyin='[recv_ft, recv_ids]', copy='[ib_ft, recv_ft_snap]')
                         do i = 1, recv_count
                             call s_get_neighborhood_idx(recv_ids(i), j)
                             if (j > 0) then
                                 ! add forces and subtract recv_snap prevent double-counting
-                                forces(j,:) = forces(j,:) + recv_ft(1:3,i) - recv_forces_snap(j,:)
-                                torques(j,:) = torques(j,:) + recv_ft(4:6,i) - recv_torques_snap(j,:)
-                                recv_forces_snap(j,:) = recv_ft(1:3,i)
-                                recv_torques_snap(j,:) = recv_ft(4:6,i)
+                                ib_ft(:,j) = ib_ft(:,j) + recv_ft(:,i) - recv_ft_snap(:,j)
+                                recv_ft_snap(:,j) = recv_ft(:,i)
                             end if
                         end do
                         $:END_GPU_PARALLEL_LOOP()
@@ -1321,17 +1312,15 @@ contains
 
                 do k = 1, 2*ib_neighborhood_radius
                     pack_pos = 0
-                    $:GPU_PARALLEL_LOOP(private='[i]', copyin='[forces, torques]')
+                    $:GPU_PARALLEL_LOOP(private='[i]')
                     do i = 1, num_ibs
                         send_ids(i) = patch_ib(i)%gbl_patch_id
-                        send_ft(1:3,i) = forces(i,:)
-                        send_ft(4:6,i) = torques(i,:)
                     end do
                     $:END_GPU_PARALLEL_LOOP()
-                    $:GPU_UPDATE(host='[send_ids, send_ft]')
+                    $:GPU_UPDATE(host='[send_ids]')
                     call MPI_PACK(num_ibs, 1, MPI_INTEGER, ib_force_send_buf, buf_size, pack_pos, MPI_COMM_WORLD, ierr)
                     call MPI_PACK(send_ids, num_ibs, MPI_INTEGER, ib_force_send_buf, buf_size, pack_pos, MPI_COMM_WORLD, ierr)
-                    call MPI_PACK(send_ft, 6*num_ibs, mpi_p, ib_force_send_buf, buf_size, pack_pos, MPI_COMM_WORLD, ierr)
+                    call MPI_PACK(ib_ft, 6*num_ibs, mpi_p, ib_force_send_buf, buf_size, pack_pos, MPI_COMM_WORLD, ierr)
                     call MPI_SENDRECV(ib_force_send_buf, pack_pos, MPI_PACKED, send_neighbor, tag, ib_force_recv_buf, buf_size, &
                                       & MPI_PACKED, recv_neighbor, tag, MPI_COMM_WORLD, MPI_STATUS_IGNORE, ierr)
                     if (recv_neighbor /= MPI_PROC_NULL) then
@@ -1340,12 +1329,11 @@ contains
                         call MPI_UNPACK(ib_force_recv_buf, buf_size, unpack_pos, recv_ids, recv_count, MPI_INTEGER, &
                                         & MPI_COMM_WORLD, ierr)
                         call MPI_UNPACK(ib_force_recv_buf, buf_size, unpack_pos, recv_ft, 6*recv_count, mpi_p, MPI_COMM_WORLD, ierr)
-                        $:GPU_PARALLEL_LOOP(private='[i, j]', copyin='[recv_ft, recv_ids]', copy='[forces, torques]')
+                        $:GPU_PARALLEL_LOOP(private='[i, j]', copyin='[recv_ft, recv_ids]', copy='[ib_ft]')
                         do i = 1, recv_count
                             call s_get_neighborhood_idx(recv_ids(i), j)
                             if (j > 0) then
-                                forces(j,:) = recv_ft(1:3,i)
-                                torques(j,:) = recv_ft(4:6,i)
+                                ib_ft(:,j) = recv_ft(:,i)
                             end if
                         end do
                         $:END_GPU_PARALLEL_LOOP()
@@ -1557,8 +1545,8 @@ contains
 
 #ifdef MFC_MPI
         if (num_procs > 1) then
-            @:DEALLOCATE(send_ids, send_ft)
-            deallocate (recv_forces_snap, recv_torques_snap, recv_ids, recv_ft)
+            @:DEALLOCATE(send_ids)
+            deallocate (recv_ft_snap, recv_ids, recv_ft)
         end if
 #endif
 
diff --git a/src/simulation/m_time_steppers.fpp b/src/simulation/m_time_steppers.fpp
index 7420404ef2..e8a877a8ae 100644
--- a/src/simulation/m_time_steppers.fpp
+++ b/src/simulation/m_time_steppers.fpp
@@ -566,6 +566,7 @@ contains
 
         if (ib) then
             if (moving_immersed_boundary_flag) then
+                $:GPU_UPDATE(host='[patch_ib(1:num_ibs)]')
                 call s_wrap_periodic_ibs()  ! wraps the positions of IBs to the local proc
                 call s_handoff_ib_ownership()  ! recomputes which ranks own which IBs and communicate to neighbors
             else if (ib_state_wrt) then