Compare commits

...

1 Commits

Author SHA1 Message Date
test0r c57c77e446 bes2600: gate PM indication completion on pending request and track chip state
When mac80211 toggles PSM on the BES2600, the host sends WSM set_pm
and waits up to 5 s on bes_power.pm_enter_cmpl for a firmware-side
PM-changed indication confirming the transition. Three sequenced
flaws make the wait-and-confirm racy and leave host/chip bookkeeping
desynced when anything misfires:

  1) bes2600_pwr_notify_ps_changed() unconditionally fires
     complete(pm_enter_cmpl) for any non-active psmode. It does not
     check whether a host-initiated set_pm is actually pending. A
     spontaneous indication (firmware-internal coex move,
     idle-driven aging) primes the completion, and the next host-
     driven enter_lp_mode sees a false success on its first
     wait_for_completion_timeout.

  2) The wait/reinit ordering in bes2600_pwr_enter_lp_mode is

         status = wait_for_completion_timeout(...);
         atomic_set(pm_set_in_process, 0);
         reinit_completion(...);

     If an indication arrives between wait_for_completion_timeout
     returning with status==1 and reinit_completion, the next
     enter_lp_mode iteration's wait can also see false success. The
     reinit must happen *before* we start the new request, not
     after handling the previous one.

  3) On wait_pm_ind timeout, the driver returns -ETIMEDOUT and walks
     away. It does not record that the firmware's actual PM state
     is no longer known to the host. Subsequent wake paths
     (gpio_wake / sbus_active) assume the chip is still active and
     hit deterministic SDIO failures when the firmware has
     transitioned anyway.

This patch is the safe-prerequisite half of a wider fix:

  * bes_pwr.h gains enum bes2600_chip_pm_state {ACTIVE, LP, UNKNOWN}
    and bes_power.chip_pm_state. Its job is to track what the host
    has *seen the firmware confirm*, not what the host has
    requested. Initialised to ACTIVE in bes2600_pwr_init().

  * bes2600_pwr_notify_ps_changed() unconditionally updates
    chip_pm_state on every indication, but only fires
    complete(pm_enter_cmpl) when atomic_cmpxchg(pm_set_in_process,
    1, 0) succeeds. A spontaneous indication can no longer prime a
    waiter that will only set up its request afterwards.

  * bes2600_pwr_enter_lp_mode() now reinit_completion()s before
    setting pm_set_in_process and sending wsm_set_pm. After a
    timeout, it cmpxchgs pm_set_in_process back to 0 (so a late
    indication cannot prime the next iteration) and on the win-
    cmpxchg branch records chip_pm_state=UNKNOWN.

A follow-up patch consumes chip_pm_state on the wake side
(bes2600_pwr_device_exit_lp_mode + bes2600_gpio_wakeup_mcu) to fix
the deterministic "active mcu fail" cycle this state-record
enables a fix for. Splitting the work this way keeps the lock-free
race fix small and reviewable on its own.

No new locks, no behaviour change on the success path. Only the
recovery path (timeout + spontaneous indication) gains correctness.

Signed-off-by: Markus Fritsche <fritsche.markus@gmail.com>
2026-04-28 16:11:08 +02:00
2 changed files with 100 additions and 9 deletions
+81 -5
View File
@@ -524,7 +524,17 @@ static int bes2600_pwr_enter_lp_mode(struct bes2600_common *hw_priv)
bes_devel("%s, psMode:%s, fastPsmIdlePeriod:%d apPsmChangePeriod:%d minAutoPsPollPeriod:%d\n",
__func__, bes2600_get_ps_mode_str(priv->powersave_mode.pmMode), priv->powersave_mode.fastPsmIdlePeriod,
priv->powersave_mode.apPsmChangePeriod, priv->powersave_mode.minAutoPsPollPeriod);
/*
* Reinit BEFORE the WSM goes out, so a stale
* indication from a previous cycle cannot have
* primed pm_enter_cmpl. From here until the
* indication callback's cmpxchg(1->0) on
* pm_set_in_process, only the indication for
* THIS request can complete the wait.
*/
reinit_completion(&hw_priv->bes_power.pm_enter_cmpl);
atomic_set(&hw_priv->bes_power.pm_set_in_process, 1);
ret = bes2600_set_pm(priv, &priv->powersave_mode);
if (ret) {
atomic_set(&hw_priv->bes_power.pm_set_in_process, 0);
@@ -535,12 +545,34 @@ static int bes2600_pwr_enter_lp_mode(struct bes2600_common *hw_priv)
/* wait power save mode changed indication */
status = wait_for_completion_timeout(&hw_priv->bes_power.pm_enter_cmpl, 5 * HZ);
atomic_set(&hw_priv->bes_power.pm_set_in_process, 0);
reinit_completion(&hw_priv->bes_power.pm_enter_cmpl);
if (!status) {
/*
* The indication callback only fires
* complete() when it observes
* pm_set_in_process == 1; cmpxchg it
* to 0 here so a late indication
* cannot prime the next wait.
*
* If we win the cmpxchg, this is a
* real timeout: the firmware's PS
* state is unknown to us. Mark it as
* such so the next wake path can
* probe before assuming the chip is
* still active.
*
* If we lose the cmpxchg, the
* indication arrived between the
* wait timing out and us getting
* here; treat as success.
*/
if (atomic_cmpxchg(&hw_priv->bes_power.pm_set_in_process,
1, 0) == 1) {
bes_devel("%s, wait pm ind timeout\n", __func__);
atomic_set(&hw_priv->bes_power.chip_pm_state,
BES2600_CHIP_PM_UNKNOWN);
timeouts++;
}
}
} else {
bes_devel("skip enter lp mode\n");
}
@@ -554,10 +586,34 @@ static int bes2600_pwr_enter_lp_mode(struct bes2600_common *hw_priv)
* in an inconsistent state that cascades into SDIO TX errors on
* the BES2600.
*/
if (timeouts == 0)
if (timeouts == 0) {
bes2600_pwr_device_enter_lp_mode(hw_priv);
else
} else {
/*
* device_enter_lp_mode() was skipped (one or more VIFs
* timed out waiting for the firmware indication) so its
* gpio_sleep(MCU) - which drops the wake-flag bit and, if
* no other subsystem holds the wake, drives the GPIO low -
* never ran. Without it the bit stays asserted, and the
* next bes2600_pwr_device_exit_lp_mode() calls
* gpio_wake(MCU) into a "bit already set" no-op: the GPIO
* never re-edges, sbus_active() exhausts its 200x2ms
* MCU_WAKEUP_READY budget against an unwoken chip, and
* the first TX after idle stalls for several seconds.
*
* Drop the MCU wake-flag bit explicitly here so the next
* wake injects a real GPIO edge. gpio_allow_mcu_sleep
* preserves multi-subsystem semantics: it only drives the
* GPIO low when no other subsystem still holds wake; if
* BT or another holder is keeping the chip awake, the
* GPIO stays high and the bit clear here is purely
* bookkeeping (so the next gpio_wake doesn't no-op).
*/
if (hw_priv->sbus_ops->gpio_sleep)
hw_priv->sbus_ops->gpio_sleep(hw_priv->sbus_priv,
GPIO_WAKE_FLAG_MCU);
ret = -ETIMEDOUT;
}
return ret;
}
@@ -833,6 +889,7 @@ void bes2600_pwr_init(struct bes2600_common *hw_priv)
hw_priv->bes_power.power_up_task = NULL;
mutex_init(&hw_priv->bes_power.pwr_mutex);
atomic_set(&hw_priv->bes_power.dev_state, 0);
atomic_set(&hw_priv->bes_power.chip_pm_state, BES2600_CHIP_PM_UNKNOWN);
init_completion(&hw_priv->bes_power.pm_enter_cmpl);
sema_init(&hw_priv->bes_power.sync_lock, 1);
device_set_wakeup_capable(hw_priv->pdev, true);
@@ -1213,9 +1270,28 @@ int bes2600_pwr_clear_busy_event(struct bes2600_common *hw_priv, u32 event)
void bes2600_pwr_notify_ps_changed(struct bes2600_common *hw_priv, u8 psmode)
{
if((psmode & 0x01) != WSM_PSM_ACTIVE) {
/*
* The firmware sends a PM-changed indication for every transition,
* including ones we didn't ask for (firmware-internal coex moves,
* idle-driven aging). Update chip_pm_state unconditionally so the
* wake path can use it, but only fire pm_enter_cmpl when a host-
* initiated set_pm is actually in flight - otherwise a stale
* indication can prime a future wait against a freshly
* reinit_completion()'ed state.
*/
if ((psmode & 0x01) != WSM_PSM_ACTIVE) {
atomic_set(&hw_priv->bes_power.chip_pm_state,
BES2600_CHIP_PM_LP);
if (atomic_cmpxchg(&hw_priv->bes_power.pm_set_in_process,
1, 0) == 1) {
bes_devel("complete pm_enter_cmpl\n");
complete(&hw_priv->bes_power.pm_enter_cmpl);
} else {
bes_devel("PM ind (LP) without pending wait; state recorded\n");
}
} else {
atomic_set(&hw_priv->bes_power.chip_pm_state,
BES2600_CHIP_PM_ACTIVE);
}
}
+15
View File
@@ -64,6 +64,20 @@ enum power_down_state
POWER_DOWN_STATE_UNLOCKED,
};
/*
* Confirmed PM state of the firmware-side chip. Tracks what the host
* has *seen* the firmware acknowledge, not what the host has
* requested. UNKNOWN means a host-initiated transition timed out
* before the firmware indication arrived; the next wake path should
* treat it as "we don't know" and probe before issuing GPIO/SDIO
* wakeup ops.
*/
enum bes2600_chip_pm_state {
BES2600_CHIP_PM_ACTIVE = 0,
BES2600_CHIP_PM_LP,
BES2600_CHIP_PM_UNKNOWN,
};
typedef void (*bes_pwr_enter_lp_cb)(struct bes2600_common *hw_priv);
typedef void (*bes_pwr_exit_lp_cb)(struct bes2600_common *hw_priv);
@@ -106,6 +120,7 @@ struct bes2600_pwr_t
bool ap_lp_bad;
struct bes2600_pwr_event_t pwr_events[BES2600_DELAY_EVENT_NUM];
atomic_t pm_set_in_process;
atomic_t chip_pm_state;
};
#ifdef CONFIG_BES2600_WOWLAN