From f2cf586f89d31e297228fabec92dcea5c3fb7165 Mon Sep 17 00:00:00 2001 From: "Claude (noether)" Date: Thu, 7 May 2026 11:30:09 +0200 Subject: [PATCH] bes2600: bus_reset on connection-loss storm to dodge assoc-comeback blackhole When mac80211 declares connection loss against the AP (typically driven by inactivity-deauth or beacon-loss), the userspace reauth that follows sometimes enters a long blackhole: the AP responds to auth with success but defers assoc with the 802.11w "assoc comeback" timer; ohm retries before the comeback timer expires; the AP eventually fires an unprotected deauth-reason-6 ("Class 2 frame received from non-authenticated station"), and recovery only completes via cross-SSID or cross-channel fallback. Receipts: ~86 s blackhole observed in the phase-7 rep on 2026-05-07 02:42, with three subsequent BSSIDs returning assoc comeback timeouts before reason-9 (STA_REQ_ASSOC_WITHOUT_AUTH) fired. Documented in marfrit/besser:notes/phase4-2026-05-07.md. When N=3 driver-side connection_loss decisions fire within a 60 s window on the same vif, skip the ieee80211_connection_loss() path and trigger the c5.2-introduced bes2600_chrdev_do_bus_reset() instead. The bus reset removes and re-probes the chip; userspace re-associates with a fresh chip state, dodging the AP's comeback-timer rejection cycle. Predicted Phase 7 delta vs current baseline: - api_connection_loss rate: unchanged (we don't address the trigger) - conditional probability of >5 s blackhole given event: <= 30 % - worst-case recovery: 86 s -> < 10 s Contract pin: bes2600_chrdev_do_bus_reset(sbus_ops, sbus_priv) at bes2600/bes_chardev.c:455, introduced by c5.2. The function is async-returning: sbus_ops->bus_reset() schedules an SDIO rescan; the helper waits up to 3 s for the remove() callback to clear sbus_priv, then returns. 
Per-vif state is gone after this point, so the recover work lives on bes2600_common (hw_priv) and uses the global bes2600_cdev for the bus_reset call rather than dereferencing per-vif state. Threshold (3 / 60 s) is well above the steady-state per-vif connection_loss rate observed in the patch-A phase-7 rep (0.86/h under sustained load), so a true storm is required to trip it. Files touched: - bes2600/bes2600.h: 3 storm-tracking fields (window start, count, recoveries) on struct bes2600_vif, 1 work_struct on struct bes2600_common, 3 prototypes - bes2600/bes_chardev.c, bes2600/bes_chardev.h: bes2600_chrdev_trigger_bus_reset() wrapper that calls bes2600_chrdev_do_bus_reset() on the file-global bes2600_cdev, so sta.c does not need to reach the static bes2600_cdev directly - bes2600/sta.c: 3 helpers + storm-account hook in bes2600_connection_loss_work + storm-init in bes2600_vif_setup + cancel_work_sync in bes2600_stop; #include bes_chardev.h was already pulled in by an earlier c-stack patch - bes2600/main.c: INIT_WORK alongside other hw_priv work_structs - bes2600/debug.c: ConnectionLossStormRecoveries seq_printf in the per-vif status seq_file output The cw1200/cw1260 ancestor has no equivalent; this is a clean addition. checkpatch.pl --no-tree --strict: clean (0/0/0). Signed-off-by: Claude (noether) --- bes2600/bes2600.h | 12 +++++++ bes2600/bes_chardev.c | 12 +++++++ bes2600/bes_chardev.h | 1 + bes2600/debug.c | 2 ++ bes2600/main.c | 2 ++ bes2600/sta.c | 82 +++++++++++++++++++++++++++++++++++++++++-- 6 files changed, 109 insertions(+), 2 deletions(-) diff --git a/bes2600/bes2600.h b/bes2600/bes2600.h index 66482f7..ec41141 100644 --- a/bes2600/bes2600.h +++ b/bes2600/bes2600.h @@ -511,6 +511,9 @@ struct bes2600_common { struct list_head coex_event_list; spinlock_t coex_event_lock; + /* Connection-loss-storm fast-recover (Trigger A). See sta.c. */ + struct work_struct connection_loss_storm_recover_work; + /* member for low power */ struct bes2600_pwr_t bes_power; @@ -627,6 +630,10 @@ struct bes2600_vif { /* CQM Implementation */ struct delayed_work bss_loss_work; struct delayed_work connection_loss_work; + /* Connection-loss-storm fast-recover (Trigger A). See sta.c. 
*/ + unsigned long connection_loss_storm_window_start; + unsigned int connection_loss_storm_count; + unsigned int connection_loss_storm_recoveries; struct work_struct tx_failure_work; int delayed_link_loss; spinlock_t bss_loss_lock; @@ -865,4 +872,9 @@ void bes2600_btusb_uninit(struct usb_interface *interface); void bes2600_decrypt_storm_init(struct bes2600_vif *priv); void bes2600_decrypt_storm_account(struct bes2600_vif *priv); +/* Connection-loss-storm fast-recover helpers — see sta.c. */ +void bes2600_connection_loss_storm_init(struct bes2600_vif *priv); +bool bes2600_connection_loss_storm_account(struct bes2600_vif *priv); +void bes2600_connection_loss_storm_recover(struct work_struct *work); + #endif /* BES2600_H */ diff --git a/bes2600/bes_chardev.c b/bes2600/bes_chardev.c index d1375bc..224c62d 100644 --- a/bes2600/bes_chardev.c +++ b/bes2600/bes_chardev.c @@ -484,6 +484,18 @@ int bes2600_chrdev_do_bus_reset(const struct sbus_ops *sbus_ops, struct sbus_pri return 0; } +/* + * Trigger bes2600_chrdev_do_bus_reset() against the file-global + * bes2600_cdev. Used by host-side recovery paths outside this + * compilation unit (e.g. sta.c connection-loss-storm fast-recover) so + * those callers do not need to reach the static bes2600_cdev directly. 
+ */ +int bes2600_chrdev_trigger_bus_reset(void) +{ + return bes2600_chrdev_do_bus_reset(bes2600_cdev.sbus_ops, + bes2600_cdev.sbus_priv); +} + bool bes2600_chrdev_is_wifi_opened(void) { bool wifi_opened = false; diff --git a/bes2600/bes_chardev.h b/bes2600/bes_chardev.h index 3f0c59b..310ab81 100644 --- a/bes2600/bes_chardev.h +++ b/bes2600/bes_chardev.h @@ -61,6 +61,7 @@ struct sbus_priv *bes2600_chrdev_get_sbus_priv_data(void); int bes2600_chrdev_check_system_close(void); int bes2600_chrdev_do_system_close(const struct sbus_ops *sbus_ops, struct sbus_priv *priv); int bes2600_chrdev_do_bus_reset(const struct sbus_ops *sbus_ops, struct sbus_priv *priv); +int bes2600_chrdev_trigger_bus_reset(void); void bes2600_chrdev_wakeup_bt(void); void bes2600_chrdev_wifi_force_close(struct bes2600_common *hw_priv, bool halt_dev); void bes2600_chrdev_usb_remove(struct bes2600_common *hw_priv); diff --git a/bes2600/debug.c b/bes2600/debug.c index ca223dd..0d68392 100644 --- a/bes2600/debug.c +++ b/bes2600/debug.c @@ -544,6 +544,8 @@ static int bes2600_status_show_priv(struct seq_file *seq, void *v) bes2600_debug_join_status[priv->join_status]); seq_printf(seq, "DecryptStormRecoveries: %u\n", priv->decrypt_storm_recoveries); + seq_printf(seq, "ConnectionLossStormRecoveries: %u\n", + priv->connection_loss_storm_recoveries); if (priv->rx_filter.promiscuous) seq_puts(seq, "Filter: promisc\n"); else if (priv->rx_filter.fcs) diff --git a/bes2600/main.c b/bes2600/main.c index 9d2aac5..d6da84a 100644 --- a/bes2600/main.c +++ b/bes2600/main.c @@ -484,6 +484,8 @@ static struct ieee80211_hw *bes2600_init_common(size_t hw_priv_data_len) spin_lock_init(&hw_priv->rtsvalue_lock); INIT_WORK(&hw_priv->dynamic_opt_txrx_work, bes2600_dynamic_opt_txrx_work); INIT_WORK(&hw_priv->tx_policy_upload_work, tx_policy_upload_work); + INIT_WORK(&hw_priv->connection_loss_storm_recover_work, + bes2600_connection_loss_storm_recover); spin_lock_init(&hw_priv->event_queue_lock); 
INIT_LIST_HEAD(&hw_priv->event_queue); INIT_WORK(&hw_priv->event_handler, bes2600_event_handler); diff --git a/bes2600/sta.c b/bes2600/sta.c index 1b78051..81f88ed 100644 --- a/bes2600/sta.c +++ b/bes2600/sta.c @@ -266,6 +266,7 @@ void bes2600_stop(struct ieee80211_hw *dev, bool suspend) cancel_work_sync(&hw_priv->coex_work); coex_stop(hw_priv); #endif + cancel_work_sync(&hw_priv->connection_loss_storm_recover_work); bes2600_wifi_stop(hw_priv); @@ -1659,6 +1660,70 @@ report: spin_unlock(&priv->bss_loss_lock); } +/* + * Connection-loss-storm fast-recover (Trigger A). + * + * bes2600_connection_loss_work below is the driver's own decision-point + * to give up on a BSS (after bss-loss detection accumulates beyond + * tolerance) and tell mac80211 via ieee80211_connection_loss(). On the + * deployed pinetab2 stack a single ieee80211_connection_loss() event + * sometimes triggers a userspace reauth blackhole (assoc-comeback + * timeouts followed by AP unprotected-deauth-reason-6) that ends only + * via cross-channel/cross-SSID fallback and can take 80+ s. Receipts at + * https://git.reauktion.de/marfrit/besser, notes/phase4-2026-05-07.md. + * + * When N connection-loss decisions land within WINDOW on the same vif, + * skip the ieee80211_connection_loss() path and trigger a chip-level + * bus_reset (the c5.2-introduced bes2600_chrdev_do_bus_reset). The chip + * is removed and re-probed; userspace re-associates from a fresh state, + * dodging the assoc-comeback loop. + * + * Threshold (3 / 60 s) is chosen well above the steady-state per-vif + * connection-loss rate observed in the patch-A Phase-7 rep + * (0.86/h under sustained load), so a true storm is required. + * + * The recover work_struct lives on bes2600_common (hw_priv) so that + * scheduling it does not race with vif teardown after bus_reset frees + * the per-vif state. 
+ */ +#define BES2600_CONNECTION_LOSS_STORM_THRESHOLD 3 +#define BES2600_CONNECTION_LOSS_STORM_WINDOW_MS 60000 + +void bes2600_connection_loss_storm_recover(struct work_struct *work) +{ + bes_warn("[bes2600] connection-loss-storm fast-recover: bus_reset\n"); + bes2600_chrdev_trigger_bus_reset(); + /* + * After bes2600_chrdev_do_bus_reset() returns, the SDIO core has + * scheduled a remove + rescan; per-vif state may already be gone. + * Do not dereference any per-vif pointer here. + */ +} + +void bes2600_connection_loss_storm_init(struct bes2600_vif *priv) +{ + priv->connection_loss_storm_window_start = 0; + priv->connection_loss_storm_count = 0; + priv->connection_loss_storm_recoveries = 0; +} + +bool bes2600_connection_loss_storm_account(struct bes2600_vif *priv) +{ + unsigned long now = jiffies; + unsigned long window = + msecs_to_jiffies(BES2600_CONNECTION_LOSS_STORM_WINDOW_MS); + + if (priv->connection_loss_storm_window_start == 0 || + time_after(now, priv->connection_loss_storm_window_start + window)) { + priv->connection_loss_storm_window_start = now; + priv->connection_loss_storm_count = 1; + return false; + } + + return ++priv->connection_loss_storm_count >= + BES2600_CONNECTION_LOSS_STORM_THRESHOLD; +} + void bes2600_connection_loss_work(struct work_struct *work) { struct bes2600_vif *priv = @@ -1668,9 +1733,21 @@ void bes2600_connection_loss_work(struct work_struct *work) bes_devel("[CQM] Reporting connection loss.\n"); bes2600_pwr_clear_busy_event(priv->hw_priv, BES_PWR_LOCK_ON_BSS_LOST); - if(bes2600_suspend_status_get(hw_priv)) { + + if (bes2600_connection_loss_storm_account(priv)) { + bes_warn("[bes2600] connection-loss storm: %u in %u s, scheduling bus reset\n", + priv->connection_loss_storm_count, + BES2600_CONNECTION_LOSS_STORM_WINDOW_MS / 1000); + priv->connection_loss_storm_count = 0; + priv->connection_loss_storm_recoveries++; + schedule_work(&hw_priv->connection_loss_storm_recover_work); + /* bus_reset will tear the chip down; skip the 
mac80211 path. */ + return; + } + + if (bes2600_suspend_status_get(hw_priv)) bes2600_pending_unjoin_set(hw_priv, priv->if_id); - } else + else ieee80211_connection_loss(priv->vif); #ifdef WIFI_BT_COEXIST_EPTA_ENABLE // set disconnected in BSS_CHANGED_ASSOC @@ -2621,6 +2698,7 @@ int bes2600_vif_setup(struct bes2600_vif *priv) /* Setup per vif workitems and locks */ spin_lock_init(&priv->vif_lock); bes2600_decrypt_storm_init(priv); + bes2600_connection_loss_storm_init(priv); INIT_WORK(&priv->join_work, bes2600_join_work); INIT_DELAYED_WORK(&priv->join_timeout, bes2600_join_timeout); INIT_WORK(&priv->unjoin_work, bes2600_unjoin_work); -- 2.47.3