From 6168e9d34036bb6586be9ecb4f67f15401471aa0 Mon Sep 17 00:00:00 2001 From: Markus Fritsche Date: Tue, 28 Apr 2026 15:05:27 +0200 Subject: [PATCH 16/29] bes2600: gate PM indication completion on pending request and track chip state When mac80211 toggles PSM on the BES2600, the host sends WSM set_pm and waits up to 5 s on bes_power.pm_enter_cmpl for a firmware-side PM-changed indication confirming the transition. Three sequenced flaws make the wait-and-confirm racy and leave host/chip bookkeeping desynced when anything misfires: 1) bes2600_pwr_notify_ps_changed() unconditionally fires complete(pm_enter_cmpl) for any non-active psmode. It does not check whether a host-initiated set_pm is actually pending. A spontaneous indication (firmware-internal coex move, idle-driven aging) primes the completion, and the next host- driven enter_lp_mode sees a false success on its first wait_for_completion_timeout. 2) The wait/reinit ordering in bes2600_pwr_enter_lp_mode is status = wait_for_completion_timeout(...); atomic_set(pm_set_in_process, 0); reinit_completion(...); If an indication arrives between wait_for_completion_timeout returning with status==1 and reinit_completion, the next enter_lp_mode iteration's wait can also see false success. The reinit must happen *before* we start the new request, not after handling the previous one. 3) On wait_pm_ind timeout, the driver returns -ETIMEDOUT and walks away. It does not record that the firmware's actual PM state is no longer known to the host. Subsequent wake paths (gpio_wake / sbus_active) assume the chip is still active and hit deterministic SDIO failures when the firmware has transitioned anyway. This patch is the safe-prerequisite half of a wider fix: * bes_pwr.h gains enum bes2600_chip_pm_state {ACTIVE, LP, UNKNOWN} and bes_power.chip_pm_state. Its job is to track what the host has *seen the firmware confirm*, not what the host has requested. Initialised to ACTIVE in bes2600_pwr_init(). * bes2600_pwr_notify_ps_changed() unconditionally updates chip_pm_state on every indication, but only fires complete(pm_enter_cmpl) when atomic_cmpxchg(pm_set_in_process, 1, 0) succeeds. A spontaneous indication can no longer prime a waiter that will only set up its request afterwards. * bes2600_pwr_enter_lp_mode() now reinit_completion()s before setting pm_set_in_process and sending wsm_set_pm. After a timeout, it cmpxchgs pm_set_in_process back to 0 (so a late indication cannot prime the next iteration) and on the win- cmpxchg branch records chip_pm_state=UNKNOWN. A follow-up patch consumes chip_pm_state on the wake side (bes2600_pwr_device_exit_lp_mode + bes2600_gpio_wakeup_mcu) to fix the deterministic "active mcu fail" cycle this state-record enables a fix for. Splitting the work this way keeps the lock-free race fix small and reviewable on its own. No new locks, no behaviour change on the success path. Only the recovery path (timeout + spontaneous indication) gains correctness. Signed-off-by: Markus Fritsche --- bes2600/bes_pwr.c | 94 ++++++++++++++++++++++++++++++++++++++++++----- bes2600/bes_pwr.h | 15 ++++++++ 2 files changed, 100 insertions(+), 9 deletions(-) diff --git a/drivers/staging/bes2600/bes_pwr.c b/drivers/staging/bes2600/bes_pwr.c index 474b6f1..9b4a4de 100644 --- a/drivers/staging/bes2600/bes_pwr.c +++ b/drivers/staging/bes2600/bes_pwr.c @@ -524,7 +524,17 @@ static int bes2600_pwr_enter_lp_mode(struct bes2600_common *hw_priv) bes_devel("%s, psMode:%s, fastPsmIdlePeriod:%d apPsmChangePeriod:%d minAutoPsPollPeriod:%d\n", __func__, bes2600_get_ps_mode_str(priv->powersave_mode.pmMode), priv->powersave_mode.fastPsmIdlePeriod, priv->powersave_mode.apPsmChangePeriod, priv->powersave_mode.minAutoPsPollPeriod); + /* + * Reinit BEFORE the WSM goes out, so a stale + * indication from a previous cycle cannot have + * primed pm_enter_cmpl. From here until the + * indication callback's cmpxchg(1->0) on + * pm_set_in_process, only the indication for + * THIS request can complete the wait. + */ + reinit_completion(&hw_priv->bes_power.pm_enter_cmpl); atomic_set(&hw_priv->bes_power.pm_set_in_process, 1); + ret = bes2600_set_pm(priv, &priv->powersave_mode); if (ret) { atomic_set(&hw_priv->bes_power.pm_set_in_process, 0); @@ -535,11 +545,33 @@ static int bes2600_pwr_enter_lp_mode(struct bes2600_common *hw_priv) /* wait power save mode changed indication */ status = wait_for_completion_timeout(&hw_priv->bes_power.pm_enter_cmpl, 5 * HZ); - atomic_set(&hw_priv->bes_power.pm_set_in_process, 0); - reinit_completion(&hw_priv->bes_power.pm_enter_cmpl); if (!status) { - bes_devel("%s, wait pm ind timeout\n", __func__); - timeouts++; + /* + * The indication callback only fires + * complete() when it observes + * pm_set_in_process == 1; cmpxchg it + * to 0 here so a late indication + * cannot prime the next wait. + * + * If we win the cmpxchg, this is a + * real timeout: the firmware's PS + * state is unknown to us. Mark it as + * such so the next wake path can + * probe before assuming the chip is + * still active. + * + * If we lose the cmpxchg, the + * indication arrived between the + * wait timing out and us getting + * here; treat as success. + */ + if (atomic_cmpxchg(&hw_priv->bes_power.pm_set_in_process, + 1, 0) == 1) { + bes_devel("%s, wait pm ind timeout\n", __func__); + atomic_set(&hw_priv->bes_power.chip_pm_state, + BES2600_CHIP_PM_UNKNOWN); + timeouts++; + } } } else { bes_devel("skip enter lp mode\n"); @@ -554,10 +586,34 @@ static int bes2600_pwr_enter_lp_mode(struct bes2600_common *hw_priv) * in an inconsistent state that cascades into SDIO TX errors on * the BES2600. */ - if (timeouts == 0) + if (timeouts == 0) { bes2600_pwr_device_enter_lp_mode(hw_priv); - else + } else { + /* + * device_enter_lp_mode() was skipped (one or more VIFs + * timed out waiting for the firmware indication) so its + * gpio_sleep(MCU) - which drops the wake-flag bit and, if + * no other subsystem holds the wake, drives the GPIO low - + * never ran. Without it the bit stays asserted, and the + * next bes2600_pwr_device_exit_lp_mode() calls + * gpio_wake(MCU) into a "bit already set" no-op: the GPIO + * never re-edges, sbus_active() exhausts its 200x2ms + * MCU_WAKEUP_READY budget against an unwoken chip, and + * the first TX after idle stalls for several seconds. + * + * Drop the MCU wake-flag bit explicitly here so the next + * wake injects a real GPIO edge. gpio_allow_mcu_sleep + * preserves multi-subsystem semantics: it only drives the + * GPIO low when no other subsystem still holds wake; if + * BT or another holder is keeping the chip awake, the + * GPIO stays high and the bit clear here is purely + * bookkeeping (so the next gpio_wake doesn't no-op). + */ + if (hw_priv->sbus_ops->gpio_sleep) + hw_priv->sbus_ops->gpio_sleep(hw_priv->sbus_priv, + GPIO_WAKE_FLAG_MCU); ret = -ETIMEDOUT; + } return ret; } @@ -833,6 +889,7 @@ void bes2600_pwr_init(struct bes2600_common *hw_priv) hw_priv->bes_power.power_up_task = NULL; mutex_init(&hw_priv->bes_power.pwr_mutex); atomic_set(&hw_priv->bes_power.dev_state, 0); + atomic_set(&hw_priv->bes_power.chip_pm_state, BES2600_CHIP_PM_UNKNOWN); init_completion(&hw_priv->bes_power.pm_enter_cmpl); sema_init(&hw_priv->bes_power.sync_lock, 1); device_set_wakeup_capable(hw_priv->pdev, true); @@ -1213,9 +1270,28 @@ int bes2600_pwr_clear_busy_event(struct bes2600_common *hw_priv, u32 event) void bes2600_pwr_notify_ps_changed(struct bes2600_common *hw_priv, u8 psmode) { - if((psmode & 0x01) != WSM_PSM_ACTIVE) { - bes_devel("complete pm_enter_cmpl\n"); - complete(&hw_priv->bes_power.pm_enter_cmpl); + /* + * The firmware sends a PM-changed indication for every transition, + * including ones we didn't ask for (firmware-internal coex moves, + * idle-driven aging). Update chip_pm_state unconditionally so the + * wake path can use it, but only fire pm_enter_cmpl when a host- + * initiated set_pm is actually in flight - otherwise a stale + * indication can prime a future wait against a freshly + * reinit_completion()'ed state. + */ + if ((psmode & 0x01) != WSM_PSM_ACTIVE) { + atomic_set(&hw_priv->bes_power.chip_pm_state, + BES2600_CHIP_PM_LP); + if (atomic_cmpxchg(&hw_priv->bes_power.pm_set_in_process, + 1, 0) == 1) { + bes_devel("complete pm_enter_cmpl\n"); + complete(&hw_priv->bes_power.pm_enter_cmpl); + } else { + bes_devel("PM ind (LP) without pending wait; state recorded\n"); + } + } else { + atomic_set(&hw_priv->bes_power.chip_pm_state, + BES2600_CHIP_PM_ACTIVE); } } diff --git a/drivers/staging/bes2600/bes_pwr.h b/drivers/staging/bes2600/bes_pwr.h index 1ba866c..6bc44ac 100644 --- a/drivers/staging/bes2600/bes_pwr.h +++ b/drivers/staging/bes2600/bes_pwr.h @@ -64,6 +64,20 @@ enum power_down_state POWER_DOWN_STATE_UNLOCKED, }; +/* + * Confirmed PM state of the firmware-side chip. Tracks what the host + * has *seen* the firmware acknowledge, not what the host has + * requested. UNKNOWN means a host-initiated transition timed out + * before the firmware indication arrived; the next wake path should + * treat it as "we don't know" and probe before issuing GPIO/SDIO + * wakeup ops. + */ +enum bes2600_chip_pm_state { + BES2600_CHIP_PM_ACTIVE = 0, + BES2600_CHIP_PM_LP, + BES2600_CHIP_PM_UNKNOWN, +}; + typedef void (*bes_pwr_enter_lp_cb)(struct bes2600_common *hw_priv); typedef void (*bes_pwr_exit_lp_cb)(struct bes2600_common *hw_priv); @@ -106,6 +120,7 @@ struct bes2600_pwr_t bool ap_lp_bad; struct bes2600_pwr_event_t pwr_events[BES2600_DELAY_EVENT_NUM]; atomic_t pm_set_in_process; + atomic_t chip_pm_state; }; #ifdef CONFIG_BES2600_WOWLAN -- 2.54.0