askrene: fix infinite loop if refine_flows() cuts down our last flow with 1 remaining before maxparts.

1. We would find a flow. 2. refine_flow would reduce it so it doesn't deliver enough. 3. So we need to find another, but we are at the limit. 4. So we remove the flow we found. 5. Goto 1. This can be fixed by disabling a channel which we caused us to reduce the flow, so we should always make forward progress. Signed-off-by: Rusty Russell <rusty@rustcorp.com.au> Changelog-Fixed: Plugins: `askrene` could enter an infinite loop when maxparts is restricted.
2025-10-31 10:05:09 +10:30
parent 6b480c74f8
commit 03d38b612d
4 changed files with 38 additions and 20 deletions
--- a/plugins/askrene/mcf.c
+++ b/plugins/askrene/mcf.c
@@ -1377,6 +1377,7 @@ linear_routes(const tal_t *ctx, struct route_query *rq,

 	while (!amount_msat_is_zero(amount_to_deliver)) {
 		size_t num_parts, parts_slots, excess_parts;
+		u32 bottleneck_idx;

                /* FIXME: This algorithm to limit the number of parts is dumb
                 * for two reasons:
@@ -1424,7 +1425,7 @@ linear_routes(const tal_t *ctx, struct route_query *rq,
 		}

 		error_message =
-		    refine_flows(ctx, rq, amount_to_deliver, &new_flows);
+			refine_flows(ctx, rq, amount_to_deliver, &new_flows, &bottleneck_idx);
 		if (error_message)
 			goto fail;

@@ -1459,14 +1460,19 @@ linear_routes(const tal_t *ctx, struct route_query *rq,
 			excess_parts = 1;
 		} else
 			excess_parts = 0;
-		if (excess_parts > 0 &&
-		    !remove_flows(&new_flows, excess_parts)) {
-			error_message = rq_log(ctx, rq, LOG_BROKEN,
-					       "%s: failed to remove %zu"
-					       " flows from a set of %zu",
-					       __func__, excess_parts,
-					       tal_count(new_flows));
-			goto fail;
+		if (excess_parts > 0) {
+			/* If we removed all the flows we found, avoid selecting them again,
+			 * by disabling one. */
+			if (excess_parts == tal_count(new_flows))
+				bitmap_set_bit(rq->disabled_chans, bottleneck_idx);
+			if (!remove_flows(&new_flows, excess_parts)) {
+				error_message = rq_log(ctx, rq, LOG_BROKEN,
+						       "%s: failed to remove %zu"
+						       " flows from a set of %zu",
+						       __func__, excess_parts,
+						       tal_count(new_flows));
+				goto fail;
+			}
 		}

 		/* Is this set of flows too expensive?
--- a/plugins/askrene/refine.c
+++ b/plugins/askrene/refine.c
@@ -235,16 +235,25 @@ static int revcmp_flows(const size_t *a, const size_t *b, struct flow **flows)
 //      -> check that htlc_max are all satisfied
 //      -> check that (x+1) at least one htlc_max is violated
 /* Given the channel constraints, return the maximum amount that can be
- * delivered. */
-static struct amount_msat path_max_deliverable(struct channel_data *path)
+ * delivered.  Sets *bottleneck_idx to one of the contraining channels' idx, if non-NULL */
+static struct amount_msat path_max_deliverable(struct channel_data *path,
+					       u32 *bottleneck_idx)
 {
 	struct amount_msat deliver = AMOUNT_MSAT(-1);
 	for (size_t i = 0; i < tal_count(path); i++) {
 		deliver =
 		    amount_msat_sub_fee(deliver, path[i].fee_base_msat,
 					path[i].fee_proportional_millionths);
-		deliver = amount_msat_min(deliver, path[i].htlc_max);
-		deliver = amount_msat_min(deliver, path[i].liquidity_max);
+		if (amount_msat_greater(deliver, path[i].htlc_max)) {
+			if (bottleneck_idx)
+				*bottleneck_idx = path[i].idx;
+			deliver = path[i].htlc_max;
+		}
+		if (amount_msat_greater(deliver, path[i].liquidity_max)) {
+			if (bottleneck_idx)
+				*bottleneck_idx = path[i].idx;
+			deliver = path[i].liquidity_max;
+		}
 	}
 	return deliver;
 }
@@ -477,9 +486,9 @@ static void write_selected_flows(const tal_t *ctx, size_t *flows_index,
 	tal_free(tmp_flows);
 }

-/* FIXME: on failure return error message */
 const char *refine_flows(const tal_t *ctx, struct route_query *rq,
-			 struct amount_msat deliver, struct flow ***flows)
+			 struct amount_msat deliver, struct flow ***flows,
+			 u32 *bottleneck_idx)
 {
 	const tal_t *working_ctx = tal(ctx, tal_t);
 	const char *error_message = NULL;
@@ -499,7 +508,7 @@ const char *refine_flows(const tal_t *ctx, struct route_query *rq,
 	for (size_t i = 0; i < tal_count(channel_mpp_cache); i++) {
 		// FIXME: does path_max_deliverable work for a single
 		// channel with 0 fees?
-		max_deliverable[i] = path_max_deliverable(channel_mpp_cache[i]);
+		max_deliverable[i] = path_max_deliverable(channel_mpp_cache[i], bottleneck_idx);
 		min_deliverable[i] = path_min_deliverable(channel_mpp_cache[i]);
 		/* We use an array of indexes to keep track of the order
 		 * of the flows. Likewise flows can be removed by simply
@@ -578,7 +587,7 @@ void squash_flows(const tal_t *ctx, struct route_query *rq,
 		struct short_channel_id_dir scidd;
 		flows_index[i] = i;
 		paths_str[i] = tal_strdup(working_ctx, "");
-		max_deliverable[i] = path_max_deliverable(channel_mpp_cache[i]);
+		max_deliverable[i] = path_max_deliverable(channel_mpp_cache[i], NULL);

 		for (size_t j = 0; j < tal_count(flow->path); j++) {
 			scidd.scid =
--- a/plugins/askrene/refine.h
+++ b/plugins/askrene/refine.h
@@ -22,9 +22,13 @@ bool create_flow_reservations_verify(const struct route_query *rq,
 				     const struct flow *flow);

 /* Modify flows to meet HTLC min/max requirements.
- * It takes into account the exact value of the fees expected at each hop. */
+ * It takes into account the exact value of the fees expected at each hop.
+ * If we reduce flows because it's too large for one channel, *bottleneck_idx
+ * is set to the idx of a channel which caused a reduction (if non-NULL).
+ */
 const char *refine_flows(const tal_t *ctx, struct route_query *rq,
-			 struct amount_msat deliver, struct flow ***flows);
+			 struct amount_msat deliver, struct flow ***flows,
+			 u32 *bottleneck_idx);

 /* Duplicated flows are merged into one. This saves in base fee and HTLC fees.
 */
--- a/tests/test_askrene.py
+++ b/tests/test_askrene.py
@@ -1538,7 +1538,6 @@ def test_simple_dummy_channel(node_factory):
    )


-@pytest.mark.xfail(strict=True)
 def test_maxparts_infloop(node_factory, bitcoind):
    # Three paths from l1 -> l5.
    # FIXME: enhance explain_failure!