⚡️ FTMotion optimized timing (#28187)

Co-authored-by: Scott Lahteine <thinkyhead@users.noreply.github.com>
2025-12-28 10:20:36 -07:00 · 2025-12-02 22:35:54 +01:00 · 2025-12-02 22:35:54 +01:00 · 14bed5aee8
commit 14bed5aee8
parent 0265975178
11 changed files with 249 additions and 232 deletions
--- a/Marlin/Configuration_adv.h
+++ b/Marlin/Configuration_adv.h
@ -1162,7 +1162,10 @@
  //#define FT_MOTION_MENU                      // Provide a MarlinUI menu to set M493 and M494 parameters
  //#define FTM_HOME_AND_PROBE                  // Use FT Motion for homing / probing. Disable if FT Motion breaks these functions.

-  #define FTM_DEFAULT_DYNFREQ_MODE dynFreqMode_DISABLED // Default mode of dynamic frequency calculation. (DISABLED, Z_BASED, MASS_BASED)
+  //#define FTM_DYNAMIC_FREQ                    // Enable for linear adjustment of XY shaping frequency according to Z or E
+  #if ENABLED(FTM_DYNAMIC_FREQ)
+    #define FTM_DEFAULT_DYNFREQ_MODE dynFreqMode_DISABLED // Default mode of dynamic frequency calculation. (DISABLED, Z_BASED, MASS_BASED)
+  #endif

  #define FTM_DEFAULT_SHAPER_X      ftMotionShaper_NONE // Default shaper mode on X axis (NONE, ZV, ZVD, ZVDD, ZVDDD, EI, 2HEI, 3HEI, MZV)
  #define FTM_SHAPING_DEFAULT_FREQ_X   37.0f    // (Hz) Default peak frequency used by input shapers
@ -1219,7 +1222,6 @@
  #define FTM_BUFFER_SIZE             128   // Window size for trajectory generation, must be a power of 2 (e.g 64, 128, 256, ...)
                                            // The total buffered time in seconds is (FTM_BUFFER_SIZE/FTM_FS)
  #define FTM_FS                     1000   // (Hz) Frequency for trajectory generation.
-  #define FTM_STEPPER_FS        2'000'000   // (Hz) Time resolution of stepper I/O update. Shouldn't affect CPU much (slower board testing needed)
  #define FTM_MIN_SHAPE_FREQ           20   // (Hz) Minimum shaping frequency, lower consumes more RAM

 #endif // FT_MOTION
--- a/Marlin/src/HAL/STM32/timers.h
+++ b/Marlin/src/HAL/STM32/timers.h
@ -51,7 +51,7 @@
 #define TEMP_TIMER_FREQUENCY 1000   // Temperature::isr() is expected to be called at around 1kHz

 // TODO: get rid of manual rate/prescale/ticks/cycles taken for procedures in stepper.cpp
-#define STEPPER_TIMER_RATE 2000000 // 2 Mhz
+#define STEPPER_TIMER_RATE 2'000'000 // 2 Mhz
 extern uint32_t GetStepperTimerClkFreq();
 #define STEPPER_TIMER_PRESCALE (GetStepperTimerClkFreq() / (STEPPER_TIMER_RATE))
 #define STEPPER_TIMER_TICKS_PER_US ((STEPPER_TIMER_RATE) / 1000000UL)               // (MHz) Stepper Timer ticks per µs
--- a/Marlin/src/inc/Conditionals-5-post.h
+++ b/Marlin/src/inc/Conditionals-5-post.h
@ -3687,9 +3687,7 @@

 // Fixed-Time Motion
 #if ENABLED(FT_MOTION)
-  #define FTM_TS (1.0f / FTM_FS)                                    // (s) Time step for trajectory generation. (Reciprocal of FTM_FS)
-  #define FTM_STEPS_PER_UNIT_TIME (FTM_STEPPER_FS / FTM_FS)         // Interpolated stepper commands per unit time
-  #define FTM_MIN_TICKS ((STEPPER_TIMER_RATE) / (FTM_STEPPER_FS))   // Minimum stepper ticks between steps
+  #define FTM_TS (1.0f / FTM_FS)                      // (s) Time step for trajectory generation. (Reciprocal of FTM_FS)
  #define FTM_RATIO (FTM_FS / FTM_MIN_SHAPE_FREQ)     // Factor for use in FTM_ZMAX. DON'T CHANGE.
  #define FTM_SMOOTH_MAX_I uint32_t(TERN0(FTM_SMOOTHING, CEIL(FTM_FS * FTM_MAX_SMOOTHING_TIME))) // Max delays for smoothing
  #define FTM_ZMAX (FTM_RATIO * 2 + FTM_SMOOTH_MAX_I) // Maximum delays for shaping functions (even numbers only!)
--- a/Marlin/src/module/ft_motion.cpp
+++ b/Marlin/src/module/ft_motion.cpp
@ -83,13 +83,6 @@ TrajectoryType FTMotion::trajectoryType = TrajectoryType::FTM_TRAJECTORY_TYPE;
 // Resonance Test
 TERN_(FTM_RESONANCE_TEST,ResonanceGenerator FTMotion::rtg;) // Resonance trajectory generator instance

-// Compact plan buffer
-stepper_plan_t FTMotion::stepper_plan_buff[FTM_BUFFER_SIZE];
-XYZEval<int64_t> FTMotion::curr_steps_q32_32 = {0};
-
-uint32_t FTMotion::stepper_plan_tail = 0,           // The index to consume from
-         FTMotion::stepper_plan_head = 0;           // The index to produce into
-
 #if FTM_HAS_LIN_ADVANCE
  bool FTMotion::use_advance_lead;
 #endif
@ -191,7 +184,7 @@ void FTMotion::loop() {

  // Set busy status for use by planner.busy()
  const bool oldBusy = busy;
-  busy = stepping.bresenham_iterations_pending > 0 || !stepper_plan_is_empty();
+  busy = stepping.is_busy();
  if (oldBusy && !busy) moving_axis_flags.reset();

 }
@ -234,15 +227,9 @@ void FTMotion::reset() {
  const bool did_suspend = stepper.suspend();
  endPos_prevBlock.reset();
  tau = 0;
-  stepper_plan_tail = stepper_plan_head = 0;
  stepping.reset();
-  curr_steps_q32_32.reset();
-
-  #if HAS_FTM_SHAPING
-    #define _RESET_ZI(A) ZERO(shaping.A.d_zi);
-    SHAPED_MAP(_RESET_ZI);
-    shaping.zi_idx = 0;
-  #endif
+  shaping.reset();
+  TERN_(FTM_SMOOTHING, smoothing.reset(););

  TERN_(HAS_EXTRUDERS, prev_traj_e = 0.0f);  // Reset linear advance variables.
  TERN_(DISTINCT_E_FACTORS, block_extruder_axis = E_AXIS);
@ -514,49 +501,12 @@ xyze_float_t FTMotion::calc_traj_point(const float dist) {
  return traj_coords;
 }

-stepper_plan_t FTMotion::calc_stepper_plan(xyze_float_t traj_coords) {
-  // 1) Convert trajectory to step delta
-  #define _TOSTEPS_q32(A, B) int64_t(traj_coords.A * planner.settings.axis_steps_per_mm[B] * (1ULL << 32))
-  XYZEval<int64_t> next_steps_q32_32 = LOGICAL_AXIS_ARRAY(
-    _TOSTEPS_q32(e, block_extruder_axis),
-    _TOSTEPS_q32(x, X_AXIS), _TOSTEPS_q32(y, Y_AXIS), _TOSTEPS_q32(z, Z_AXIS),
-    _TOSTEPS_q32(i, I_AXIS), _TOSTEPS_q32(j, J_AXIS), _TOSTEPS_q32(k, K_AXIS),
-    _TOSTEPS_q32(u, U_AXIS), _TOSTEPS_q32(v, V_AXIS), _TOSTEPS_q32(w, W_AXIS)
-  );
-  #undef _TOSTEPS_q32
-
-  constexpr uint32_t ITERATIONS_PER_TRAJ_INV_uq0_32 = (1ULL << 32) / ITERATIONS_PER_TRAJ;
-  stepper_plan_t stepper_plan;
-
-  #define _RUN_AXIS(A) do{                                                                                   \
-      int64_t delta_q32_32 = (next_steps_q32_32.A - curr_steps_q32_32.A);                                    \
-      /* 2) Set stepper plan direction bits */                                                               \
-      int16_t sign = (delta_q32_32 > 0) - (delta_q32_32 < 0);                                                \
-      stepper_plan.dir_bits.A = delta_q32_32 > 0;                                                            \
-      /* 3) Set per-iteration advance dividend Q0.32 */                                                      \
-      uint64_t delta_uq32_32 = ABS(delta_q32_32);                                                            \
-      /* dividend = delta_q32_32 / ITERATIONS_PER_TRAJ, but avoiding division and an intermediate int128 */  \
-      /* Note the integer part would overflow if there is eq or more than 1 steps per isr */                 \
-      uint32_t integer_part = (delta_uq32_32 >> 32) * ITERATIONS_PER_TRAJ_INV_uq0_32;                        \
-      uint32_t fractional_part = ((delta_uq32_32 & UINT32_MAX) * ITERATIONS_PER_TRAJ_INV_uq0_32) >> 32;      \
-      stepper_plan.advance_dividend_q0_32.A = integer_part + fractional_part;                                \
-      /* 4) Advance curr_steps by the exact integer steps that Bresenham will emit */                        \
-      /* It's like doing current_steps = next_steps, but considering any fractional error */                 \
-      /* from the dividend. This way there can be no drift. */                                               \
-      curr_steps_q32_32.A += (int64_t)stepper_plan.advance_dividend_q0_32.A * sign * ITERATIONS_PER_TRAJ;    \
-    } while(0);
-  LOGICAL_AXIS_MAP(_RUN_AXIS);
-  #undef _RUN_AXIS
-
-  return stepper_plan;
-}
-
 /**
 * Generate stepper data of the trajectory.
 * Called from FTMotion::loop()
 */
 void FTMotion::fill_stepper_plan_buffer() {
-  while (!stepper_plan_is_full()) {
+  while (!stepping.is_full()) {
    float total_duration = currentGenerator->getTotalDuration(); // If the current plan is empty, it will have zero duration.
    while (tau + FTM_TS > total_duration) {
      /**
@ -581,10 +531,8 @@ void FTMotion::fill_stepper_plan_buffer() {
    // Get distance from trajectory generator
    xyze_float_t traj_coords = calc_traj_point(currentGenerator->getDistanceAtTime(tau));

-    stepper_plan_t plan = calc_stepper_plan(traj_coords);
-
-    // Store in buffer
-    enqueue_stepper_plan(plan);
+    // Calculate and store stepper plan in buffer
+    stepping_enqueue(traj_coords);

  }
 }
--- a/Marlin/src/module/ft_motion.h
+++ b/Marlin/src/module/ft_motion.h
@ -44,7 +44,7 @@

 #define FTM_VERSION   2   // Change version when hosts need to know

-#if HAS_X_AXIS && (HAS_Z_AXIS || HAS_EXTRUDERS)
+#if ENABLED(FTM_DYNAMIC_FREQ)
  #define HAS_DYNAMIC_FREQ 1
  #if HAS_Z_AXIS
    #define HAS_DYNAMIC_FREQ_MM 1
@ -191,29 +191,20 @@ class FTMotion {
      return cfg.active ? axis_move_dir[axis] : stepper.last_direction_bits[axis];
    }

+    // A frame of the stepping plan
    static stepping_t stepping;
-    FORCE_INLINE static bool stepper_plan_is_empty() {
-      return stepper_plan_head == stepper_plan_tail;
-    }
-    FORCE_INLINE static bool stepper_plan_is_full() {
-      return ((stepper_plan_head + 1) & FTM_BUFFER_MASK) == stepper_plan_tail;
-    }
-    FORCE_INLINE static uint32_t stepper_plan_count() {
-      return (stepper_plan_head - stepper_plan_tail) & FTM_BUFFER_MASK;
-    }
-    // Enqueue a plan
-    FORCE_INLINE static void enqueue_stepper_plan(const stepper_plan_t& d) {
-      stepper_plan_buff[stepper_plan_head] = d;
-      stepper_plan_head = (stepper_plan_head + 1u) & FTM_BUFFER_MASK;
-    }
-    // Dequeue a plan.
-    // Zero-copy consume; caller must use it before next dequeue if they keep a ref.
-    // Done like this to avoid double copy.
-    // e.g do: stepper_plan_t data = dequeue_stepper_plan(); this is ok
-    FORCE_INLINE static stepper_plan_t& dequeue_stepper_plan() {
-      const uint32_t i = stepper_plan_tail;
-      stepper_plan_tail = (i + 1u) & FTM_BUFFER_MASK;
-      return stepper_plan_buff[i];
+
+    // Add a single set of coordinates in the stepping plan
+    FORCE_INLINE static void stepping_enqueue(const xyze_float_t traj_coords) {
+      #define _TOSTEPS_q16(A, B) int64_t(traj_coords.A * planner.settings.axis_steps_per_mm[B] * (1ULL << 16))
+      XYZEval<int64_t> next_steps_q48_16 = LOGICAL_AXIS_ARRAY(
+        _TOSTEPS_q16(e, block_extruder_axis),
+        _TOSTEPS_q16(x, X_AXIS), _TOSTEPS_q16(y, Y_AXIS), _TOSTEPS_q16(z, Z_AXIS),
+        _TOSTEPS_q16(i, I_AXIS), _TOSTEPS_q16(j, J_AXIS), _TOSTEPS_q16(k, K_AXIS),
+        _TOSTEPS_q16(u, U_AXIS), _TOSTEPS_q16(v, V_AXIS), _TOSTEPS_q16(w, W_AXIS)
+      );
+      #undef _TOSTEPS_q16
+      stepping.enqueue(next_steps_q48_16);
    }

  private:
@ -262,12 +253,7 @@ class FTMotion {
    static void plan_runout_block();
    static void fill_stepper_plan_buffer();
    static xyze_float_t calc_traj_point(const float dist);
-    static stepper_plan_t calc_stepper_plan(xyze_float_t delta);
    static bool plan_next_block();
-    // stepper_plan buffer variables.
-    static stepper_plan_t stepper_plan_buff[FTM_BUFFER_SIZE];
-    static uint32_t stepper_plan_tail, stepper_plan_head;
-    static XYZEval<int64_t> curr_steps_q32_32;

 }; // class FTMotion

--- a/Marlin/src/module/ft_motion/resonance_generator.cpp
+++ b/Marlin/src/module/ft_motion/resonance_generator.cpp
@ -53,7 +53,7 @@ void ResonanceGenerator::reset() {
 void ResonanceGenerator::fill_stepper_plan_buffer() {
  xyze_float_t traj_coords = {};

-  while (!ftMotion.stepper_plan_is_full()) {
+  while (!ftMotion.stepping.is_full()) {
    // Calculate current frequency
    // Logarithmic approach with duration per octave
    const float freq = rt_params.min_freq * powf(2.0f, rt_time / rt_params.octave_duration);
@ -77,9 +77,8 @@ void ResonanceGenerator::fill_stepper_plan_buffer() {
    #define _SET_TRAJ(A) traj_coords.A = rt_params.start_pos.A + (rt_params.axis == A##_AXIS ? pos_offset : 0.0f);
    LOGICAL_AXIS_MAP(_SET_TRAJ);

-    stepper_plan_t plan = ftMotion.calc_stepper_plan(traj_coords);
    // Store in buffer
-    ftMotion.enqueue_stepper_plan(plan);
+    ftMotion.stepping_enqueue(traj_coords);
    // Increment time for the next point
    rt_time += FTM_TS;
  }
--- a/Marlin/src/module/ft_motion/shaping.h
+++ b/Marlin/src/module/ft_motion/shaping.h
@ -114,4 +114,9 @@ typedef struct Shaping {
  // Of course things can't be done in the past, so when shaping is applied, the all axes are delayed by largest_delay_samples
  // minus their own centroid delay. This makes them all be equally delayed and therefore in synch.
  void refresh_largest_delay_samples() { largest_delay_samples = -_MIN(SHAPED_LIST(X.Ni[0], Y.Ni[0], Z.Ni[0], E.Ni[0])); }
+  void reset(){
+      #define _RESET_ZI(A) ZERO(A.d_zi);
+      SHAPED_MAP(_RESET_ZI);
+      zi_idx = 0;
+    }
 } shaping_t;
--- a/Marlin/src/module/ft_motion/smoothing.h
+++ b/Marlin/src/module/ft_motion/smoothing.h
@ -44,4 +44,9 @@ typedef struct Smoothing {
  // is done by delaying all axes to match the laggiest one (i.e largest_delay_samples).
  void refresh_largest_delay_samples() { largest_delay_samples = _MAX(CARTES_LIST(X.delay_samples, Y.delay_samples, Z.delay_samples, E.delay_samples)); }
  // Note: the delay equals smoothing_time iff the input signal frequency is lower than 1/smoothing_time, luckily for us, this holds in this case
+  void reset() {
+    #define _CLEAR(A) ZERO(A.smoothing_pass);
+    LOGICAL_AXIS_MAP(_CLEAR);
+    #undef _CLEAR
+  }
 } smoothing_t;
--- a/Marlin/src/module/ft_motion/stepping.cpp
+++ b/Marlin/src/module/ft_motion/stepping.cpp
@ -1,115 +0,0 @@
-/**
- * Marlin 3D Printer Firmware
- * Copyright (c) 2025 MarlinFirmware [https://github.com/MarlinFirmware/Marlin]
- *
- * Based on Sprinter and grbl.
- * Copyright (c) 2011 Camiel Gubbels / Erik van der Zalm
- *
- * This program is free software: you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation, either version 3 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program.  If not, see <https://www.gnu.org/licenses/>.
- *
- */
-
-#include "../../inc/MarlinConfigPre.h"
-
-#if ENABLED(FT_MOTION)
-
-#include "stepping.h"
-#include "../ft_motion.h"
-
-void Stepping::reset() {
-  stepper_plan.reset();
-  delta_error_q32.set(LOGICAL_AXIS_LIST_1(1UL << 31)); // Start as 0.5 in q32 so steps are rounded
-  step_bits = 0;
-  bresenham_iterations_pending = 0;
-}
-
-uint32_t Stepping::advance_until_step() {
-  xyze_ulong_t space_q32 = -delta_error_q32 + UINT32_MAX; // How much accumulation until a step in any axis is ALMOST due
-                                                          // TODO: Add operators to types.h for scalar destination.
-
-  xyze_ulong_t advance_q32 = stepper_plan.advance_dividend_q0_32;
-  uint32_t iterations = bresenham_iterations_pending;
-  // Per-axis lower-bound approx of floor(space_q32/adv), min across axes (lower bound because this fast division underestimates result by up to 1)
-  //#define RUN_AXIS(A) if (advance_q32.A > 0) NOMORE(iterations, space_q32.A / advance_q32.A);
-  #define RUN_AXIS(A) NOMORE(iterations, uint32_t((uint64_t(space_q32.A) * advance_dividend_reciprocal.A) >> 32));
-  LOGICAL_AXIS_MAP(RUN_AXIS);
-  #undef RUN_AXIS
-
-  #define RUN_AXIS(A) delta_error_q32.A += advance_q32.A * iterations;
-  LOGICAL_AXIS_MAP(RUN_AXIS);
-  #undef RUN_AXIS
-
-  bresenham_iterations_pending -= iterations;
-  step_bits = 0;
-  // iterations may be underestimated by 1 by the cheap division, therefore we may have to do 2 iterations here
-  while (bresenham_iterations_pending && !(bool)step_bits) {
-    iterations++;
-    bresenham_iterations_pending--;
-    #define RUN_AXIS(A) do{                                                      \
-        delta_error_q32.A += stepper_plan.advance_dividend_q0_32.A;              \
-        step_bits.A = delta_error_q32.A < stepper_plan.advance_dividend_q0_32.A; \
-      }while(0);
-    LOGICAL_AXIS_MAP(RUN_AXIS);
-    #undef RUN_AXIS
-  }
-
-  return iterations * INTERVAL_PER_ITERATION;
-}
-
-uint32_t Stepping::plan() {
-  uint32_t intervals = 0;
-  if (bresenham_iterations_pending > 0) {
-    intervals = advance_until_step();
-    if (bool(step_bits)) return intervals; // steps to make => return the wait time so it gets done in due time
-    // Else all Bresenham iterations were advanced without steps => this is just the frame end, so plan the next one directly and accumulate the wait
-  }
-
-  if (ftMotion.stepper_plan_is_empty()) {
-    bresenham_iterations_pending = 0;
-    step_bits = 0;
-    return INTERVAL_PER_TRAJ_POINT;
-  }
-
-  AxisBits old_dir_bits = stepper_plan.dir_bits;
-  stepper_plan = ftMotion.dequeue_stepper_plan();
-  const AxisBits dir_flip_mask = old_dir_bits ^ stepper_plan.dir_bits;  // axes that must toggle now
-  if (dir_flip_mask) {
-    #define _HANDLE_DIR_CHANGES(A) if (dir_flip_mask.A) delta_error_q32.A *= -1;
-    LOGICAL_AXIS_MAP(_HANDLE_DIR_CHANGES);
-    #undef _HANDLE_DIR_CHANGES
-  }
-
-  if (stepper_plan.advance_dividend_q0_32 == 0) {
-    // Don't waste time in zero motion traj points
-    bresenham_iterations_pending = 0;
-    step_bits = 0;
-    return INTERVAL_PER_TRAJ_POINT;
-  }
-
-  // This vector division is unavoidable, but it saves a division per step during Bresenham
-  // The reciprocal is actually 2^32/dividend, but that requires dividing a uint64_t, which quite expensive
-  // Since even the real reciprocal may underestimate the quotient by 1 anyway already, this optimisation doesn't
-  // make things worse. This underestimation is compensated for in advance_until_step.
-  #define _DIVIDEND_RECIP(A) do{ \
-    const uint32_t d = stepper_plan.advance_dividend_q0_32.A; \
-    advance_dividend_reciprocal.A = d ? UINT32_MAX / d : UINT32_MAX; \
-  }while(0);
-  LOGICAL_AXIS_MAP(_DIVIDEND_RECIP);
-  #undef _DIVIDEND_RECIP
-
-  bresenham_iterations_pending = ITERATIONS_PER_TRAJ;
-  return intervals + advance_until_step();
-}
-
-#endif // FT_MOTION
--- a/Marlin/src/module/ft_motion/stepping.h
+++ b/Marlin/src/module/ft_motion/stepping.h
@ -23,31 +23,222 @@

 #include "../../inc/MarlinConfig.h"

+//
+// uint64-free equivalent of: ((uint64_t)a * b) >> 16
+//
+FORCE_INLINE constexpr uint32_t a_times_b_shift_16(uint32_t a, uint32_t b) {
+  uint32_t hi = a >> 16;
+  uint32_t lo = a & 0xFFFFu;
+  return (hi * b) + ((lo * b) >> 16);
+}
+#define FTM_NEVER (UINT16_MAX)                                       // Reserved number to indicate "no ticks in this frame" (FRAME_TICKS_FP+1 would work too)
+constexpr uint32_t FRAME_TICKS = STEPPER_TIMER_RATE / FTM_FS;        // Timer ticks in a frame
+static_assert(FRAME_TICKS < FTM_NEVER, "(STEPPER_TIMER_RATE / FTM_FS) must be < 2^16 (otherwise fixed-point numbers exceed uint16 vars).");
+constexpr uint32_t FTM_Q_INT = 32u - __builtin_clz(FRAME_TICKS + 1); // Bits to represent the max value (duration of a frame, +1 one for FTM_NEVER).
+                                                                     // "clz" counts leading zeroes.
+constexpr uint32_t FTM_Q = 16 - FTM_Q_INT;                           // uint16 interval fractional bits.
+                                                                     // Intervals buffer has fixed point numbers with the point on this position
+
+// The _FP and _fp suffixes mean the number is in fixed point format with the point at the FTM_Q position.
+// See: https://en.wikipedia.org/wiki/Fixed-point_arithmetic
+// E.g number_fp = number << FTM_Q
+//     number == (number_fp >> FTM_Q)
+constexpr uint32_t ONE_FP = 1 << FTM_Q;                   // Number 1 in fixed point format
+constexpr uint32_t FP_FLOOR_MASK = ~(ONE_FP - 1);         // Bit mask to do FLOOR in fixed point
+constexpr uint32_t FRAME_TICKS_FP = FRAME_TICKS << FTM_Q; // Ticks in a frame in fixed point
+
 typedef struct stepper_plan {
  AxisBits dir_bits;
-  xyze_ulong_t advance_dividend_q0_32{0};
-  void reset() { advance_dividend_q0_32.reset(); }
+  xyze_uint_t first_interval_fp;
+  xyze_uint_t interval_fp;
 } stepper_plan_t;

 // Stepping plan handles steps for a whole frame (trajectory point delta)
 typedef struct Stepping {
-  stepper_plan_t stepper_plan;
-  xyze_ulong_t advance_dividend_reciprocal{0}; // Note this 32 bit reciprocal underestimates quotients by at most one.
-  xyze_ulong_t delta_error_q32{ LOGICAL_AXIS_LIST_1(_BV32(31)) };
-  AxisBits step_bits;
-  uint32_t bresenham_iterations_pending;

-  void reset();
-  // Updates error and bresenham_iterations_pending, sets step_bits and returns interval until the next step (or end of frame).
-  uint32_t advance_until_step();
-  /**
-   * If bresenham_iterations_pending, advance to next actual step.
-   * Else, consume stepper data point
-   * Then return interval until that next step
-   */
-  uint32_t plan();
-  #define INTERVAL_PER_ITERATION (STEPPER_TIMER_RATE / FTM_STEPPER_FS)
-  #define INTERVAL_PER_TRAJ_POINT (STEPPER_TIMER_RATE / FTM_FS)
-  #define ITERATIONS_PER_TRAJ (FTM_STEPPER_FS * FTM_TS)
+  //
+  // ISR part
+  //
+
+  AxisBits dir_bits;
+  AxisBits step_bits;
+
+  xyze_ulong_t axis_interval_fp{ LOGICAL_AXIS_LIST_1(FTM_NEVER) };
+  xyze_ulong_t ticks_left_per_axis_fp{ LOGICAL_AXIS_LIST_1(FTM_NEVER) };
+  uint32_t ticks_left_in_frame_fp = 0;
+
+  // Return how many full ticks we must wait before
+  // generating the next step pulse. The call is inexpensive:
+  //  - no heap, no locks – pure arithmetic on pre-computed data
+  FORCE_INLINE uint32_t advance_until_step() {
+    step_bits = 0;
+    uint32_t ticks_to_wait_fp = 0;
+
+    for (;;) {
+      // Smallest remaining tick count among all axes – next step time
+      const uint32_t ticks_to_next_step_fp = ticks_left_per_axis_fp.small();
+
+      // Does the frame finish before this next step occurs?
+      if (ticks_to_next_step_fp > ticks_left_in_frame_fp) {
+        // Frame ends before next step
+        if (is_empty()) {
+          ticks_left_in_frame_fp = 0;
+          ticks_left_per_axis_fp = FTM_NEVER;
+          return FTM_NEVER;
+        }
+
+        // Consume the rest of this frame's time
+        const uint32_t wait_floor_fp = ticks_left_in_frame_fp & FP_FLOOR_MASK;
+        ticks_to_wait_fp += wait_floor_fp;
+        ticks_left_in_frame_fp -= wait_floor_fp;
+
+        //
+        // Pull the next plan – it already contains:
+        //  - direction bits
+        //  - first_interval_fp (time to the first step)
+        //  - interval_fp       (repeating step period)
+        //
+        const stepper_plan_t next = dequeue();
+        dir_bits         = next.dir_bits;
+        axis_interval_fp = next.interval_fp.asUInt32();
+
+        // Note the frame will actually end a fraction of a tick in the future, and ticks_left_in_frame_fp still has that fraction.
+        // Instead of discarding that time, we delay both the end of the next frame, and all first steps by that amount.
+        ticks_left_per_axis_fp  = next.first_interval_fp.asUInt32();
+        ticks_left_per_axis_fp += ticks_left_in_frame_fp;
+        ticks_left_in_frame_fp += FRAME_TICKS_FP;   // Start a fresh frame
+      }
+      else {
+        // Step happens before frame end
+        // Advance to it
+        const uint32_t wait_floor_fp = ticks_to_next_step_fp & FP_FLOOR_MASK;
+        ticks_to_wait_fp += wait_floor_fp;
+        ticks_left_in_frame_fp -= wait_floor_fp;
+        ticks_left_per_axis_fp -= wait_floor_fp;
+
+        // Build step_bits: any axis whose counter < ONE_FP should step before the next tick, so we tick now
+        // unless the frame ends earlier.
+        uint32_t limit_fp = _MIN(ONE_FP - 1, ticks_left_in_frame_fp);
+        auto _set_step_bit = [&](const AxisEnum A) __attribute__((always_inline)) {
+          if (ticks_left_per_axis_fp[A] <= limit_fp) {
+            step_bits[A] = 1;
+            ticks_left_per_axis_fp[A] += axis_interval_fp[A];
+          }
+        };
+        LOGICAL_AXIS_CALL(_set_step_bit);
+
+        return ticks_to_wait_fp >> FTM_Q;   // Convert fixed point back to whole ticks
+      }
+    } // loop forever
+  }
+
+  FORCE_INLINE void reset() {
+    step_bits = 0;
+    axis_interval_fp = FTM_NEVER;
+    ticks_left_per_axis_fp = FTM_NEVER;
+    ticks_left_in_frame_fp = 0;
+
+    stepper_plan_tail = stepper_plan_head = 0;
+    curr_steps_q48_16.reset();
+  }
+
+  //
+  // Buffering part
+  //
+
+  stepper_plan_t stepper_plan_buff[FTM_BUFFER_SIZE];
+  uint32_t stepper_plan_tail = 0, stepper_plan_head = 0;
+  XYZEval<int64_t> curr_steps_q48_16{0};
+
+  FORCE_INLINE void enqueue(XYZEval<int64_t> next_steps_q48_16) {
+
+    stepper_plan_t stepper_plan;
+    constexpr uint32_t HALF_PHASE_OFFSET = (1UL << 15); // to make steps at .5 crossings instead of integers to center the error
+
+    auto _run_axis = [&](const AxisEnum A) __attribute__((always_inline)) {
+      // Add half-phase offset to keep step error centred
+      const int64_t offset_curr_q48_16 = curr_steps_q48_16[A] + HALF_PHASE_OFFSET,
+                    offset_next_q48_16 = next_steps_q48_16[A] + HALF_PHASE_OFFSET;
+      curr_steps_q48_16[A] = next_steps_q48_16[A];
+
+      // Determine direction change
+      const bool new_dir = offset_next_q48_16 >= offset_curr_q48_16;
+      stepper_plan.dir_bits[A] = new_dir;
+
+      // Δsteps in Q16.16 format – magnitude only
+      const uint32_t delta_q16_16 = abs(offset_next_q48_16 - offset_curr_q48_16);
+
+      // Current / next phase (fractional part of the position)
+      uint32_t curr_phase_q1_16 = offset_curr_q48_16 & 0xFFFF,
+               next_phase_q1_16 = offset_next_q48_16 & 0xFFFF;
+      if (!new_dir) {
+        // When going backwards, the phase is 1-phase
+        curr_phase_q1_16 = (1UL<<16) - curr_phase_q1_16;
+        next_phase_q1_16 = (1UL<<16) - next_phase_q1_16;
+      }
+      // When going, e.g., from 0.6 to 1.0, the delta is not a whole step,
+      // but the phase overflow indicates a step.
+      const uint32_t carry = curr_phase_q1_16 > next_phase_q1_16;
+
+      // steps_to_make = integer steps + potential fraction crossing an integer
+      const uint16_t steps_to_make = (delta_q16_16 >> 16) + carry;
+
+      if (steps_to_make == 0) {                // No steps on this axis
+        stepper_plan.first_interval_fp[A] = FTM_NEVER;
+        stepper_plan.interval_fp[A]       = FTM_NEVER;
+        return;
+      }
+
+      // Compute the exact time between steps.
+      //   interval = ticks_per_frame / delta
+      //   current_frame_phase_fp = interval * curr_phase
+      const uint32_t interval_fp = (FRAME_TICKS_FP << 16) / delta_q16_16,
+                     current_frame_phase_fp = a_times_b_shift_16(interval_fp, curr_phase_q1_16);
+      uint32_t first_interval_fp = interval_fp - current_frame_phase_fp;
+
+      // The calculation of interval_fp may undershoot its value by a fraction
+      // due to integer (floor) division. This small fractional error can
+      // occasionally make a spurious step fit inside this frame.
+      // To avoid that corner case, the first interval is incremented just enough
+      // for it to not fit.
+      const uint32_t tick_of_spurious_step_fp = first_interval_fp + interval_fp * steps_to_make;
+      if (tick_of_spurious_step_fp <= FRAME_TICKS_FP) {
+        first_interval_fp += FRAME_TICKS_FP - tick_of_spurious_step_fp + 1;
+      }
+
+      stepper_plan.first_interval_fp[A] = _MIN(first_interval_fp, FTM_NEVER);
+      stepper_plan.interval_fp[A]       = _MIN(interval_fp, FTM_NEVER);
+    };
+
+    LOGICAL_AXIS_CALL(_run_axis);
+
+    // Store the plan into the circular buffer
+    stepper_plan_buff[stepper_plan_head] = stepper_plan;
+    stepper_plan_head = (stepper_plan_head + 1U) & FTM_BUFFER_MASK;
+  }
+
+  // Dequeue a plan.
+  // Zero-copy consume; caller must use it before next dequeue if they keep a ref.
+  // Done like this to avoid double copy.
+  // e.g do: stepper_plan_t data = dequeue(); this is ok
+  FORCE_INLINE stepper_plan_t& dequeue() {
+    const uint32_t i = stepper_plan_tail;
+    stepper_plan_tail = (i + 1U) & FTM_BUFFER_MASK;
+    return stepper_plan_buff[i];
+  }
+
+  //
+  // Simple helper predicates
+  //
+
+  FORCE_INLINE bool is_busy() {
+    return !(is_empty() && ticks_left_in_frame_fp == 0);
+  }
+  FORCE_INLINE bool is_empty() {
+    return stepper_plan_head == stepper_plan_tail;
+  }
+  FORCE_INLINE bool is_full() {
+    return ((stepper_plan_head + 1) & FTM_BUFFER_MASK) == stepper_plan_tail;
+  }

 } stepping_t;
--- a/Marlin/src/module/stepper.cpp
+++ b/Marlin/src/module/stepper.cpp
@ -3585,9 +3585,7 @@ void Stepper::report_positions() {
   */
  void Stepper::ftMotion_stepper() {
    AxisBits &step_bits = ftMotion.stepping.step_bits;            // Aliases for prettier code
-    AxisBits &dir_bits = ftMotion.stepping.stepper_plan.dir_bits;
-
-    if (step_bits.bits == 0) return;
+    AxisBits &dir_bits = ftMotion.stepping.dir_bits;

    USING_TIMED_PULSE();