From 8c7ac33f882a9406d7e413d755c50669bfb61b77 Mon Sep 17 00:00:00 2001
From: Rusty Russell <rusty@rustcorp.com.au>
Date: Sun, 16 Nov 2025 15:36:13 +1030
Subject: [PATCH] askrene: implement reduce_num_flows in refine, using
 increase_flows().

Now we simply call it at the end.  We need to check it hasn't violated fee maxima, but
otherwise it's simple.

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
Changelog-Fixed: Plugins: `askrene` now handles limits on number of htlcs much more gracefully.
---
 plugins/askrene/mcf.c    | 79 +++++++++++++++-------------------------
 plugins/askrene/refine.c | 61 +++++++++++++++++++------------
 plugins/askrene/refine.h |  9 +++--
 tests/test_askrene.py    | 21 +++++------
 4 files changed, 83 insertions(+), 87 deletions(-)

diff --git a/plugins/askrene/mcf.c b/plugins/askrene/mcf.c
index 44cdfc728..c04f72022 100644
--- a/plugins/askrene/mcf.c
+++ b/plugins/askrene/mcf.c
@@ -1377,9 +1377,6 @@ linear_routes(const tal_t *ctx, struct route_query *rq,
 	struct reserve_hop *reservations = new_reservations(working_ctx, rq);
 
 	while (!amount_msat_is_zero(amount_to_deliver)) {
-		size_t num_parts, parts_slots, excess_parts;
-		u32 bottleneck_idx;
-
 		if (timemono_after(time_mono(), deadline)) {
 			error_message = rq_log(ctx, rq, LOG_BROKEN,
 					       "%s: timed out after deadline",
@@ -1387,24 +1384,7 @@ linear_routes(const tal_t *ctx, struct route_query *rq,
 			goto fail;
 		}
 
-                /* FIXME: This algorithm to limit the number of parts is dumb
-                 * for two reasons:
-                 *      1. it does not take into account that several loop
-                 *      iterations here may produce two flows along the same
-                 *      path that after "squash_flows" become a single flow.
-                 *      2. limiting the number of "slots" to 1 makes us fail to
-                 *      see some solutions that use more than one of those
-                 *      existing paths.
-                 *
-                 * A better approach could be to run MCF, remove the excess
-                 * paths and then recompute a MCF on a network where each arc is
-                 * one of the previously remaining paths, ie. redistributing the
-                 * payment amount among the selected paths in a cost-efficient
-                 * way. */
 		new_flows = tal_free(new_flows);
-		num_parts = tal_count(*flows);
-		assert(num_parts < rq->maxparts);
-		parts_slots = rq->maxparts - num_parts;
 
 		/* If the amount_to_deliver is very small we better use a single
 		 * path computation because:
@@ -1415,8 +1395,7 @@ linear_routes(const tal_t *ctx, struct route_query *rq,
                  * do not deliver the entire payment amount by just a small
                  * amount. */
 		if (amount_msat_less_eq(amount_to_deliver,
-					SINGLE_PATH_THRESHOLD) ||
-		    parts_slots == 1) {
+					SINGLE_PATH_THRESHOLD)) {
 			new_flows = single_path_flow(working_ctx, rq, srcnode,
 						     dstnode, amount_to_deliver,
 						     mu, delay_feefactor);
@@ -1433,7 +1412,7 @@ linear_routes(const tal_t *ctx, struct route_query *rq,
 		}
 
 		error_message =
-			refine_flows(ctx, rq, amount_to_deliver, &new_flows, &bottleneck_idx);
+			refine_flows(ctx, rq, amount_to_deliver, &new_flows, NULL);
 		if (error_message)
 			goto fail;
 
@@ -1457,32 +1436,6 @@ linear_routes(const tal_t *ctx, struct route_query *rq,
 			assert(!amount_msat_is_zero(new_flows[i]->delivers));
 		}
 
-		if (tal_count(new_flows) > parts_slots) {
-			/* Remove the excees of parts and leave one slot for the
-			 * next round of computations. */
-			excess_parts = 1 + tal_count(new_flows) - parts_slots;
-		} else if (tal_count(new_flows) == parts_slots &&
-			   amount_msat_less(all_deliver, amount_to_deliver)) {
-			/* Leave exactly 1 slot for the next round of
-			 * computations. */
-			excess_parts = 1;
-		} else
-			excess_parts = 0;
-		if (excess_parts > 0) {
-			/* If we removed all the flows we found, avoid selecting them again,
-			 * by disabling one. */
-			if (excess_parts == tal_count(new_flows))
-				bitmap_set_bit(rq->disabled_chans, bottleneck_idx);
-			if (!remove_flows(&new_flows, excess_parts)) {
-				error_message = rq_log(ctx, rq, LOG_BROKEN,
-						       "%s: failed to remove %zu"
-						       " flows from a set of %zu",
-						       __func__, excess_parts,
-						       tal_count(new_flows));
-				goto fail;
-			}
-		}
-
 		/* Is this set of flows too expensive?
 		 * We can check if the new flows are within the fee budget,
 		 * however in some cases we have discarded some flows at this
@@ -1606,6 +1559,34 @@ linear_routes(const tal_t *ctx, struct route_query *rq,
 	/* all set! Now squash flows that use the same path */
 	squash_flows(ctx, rq, flows);
 
+	/* If we're over the number of parts, try to cram excess into the
+	 * largest-capacity parts */
+	if (tal_count(*flows) > rq->maxparts) {
+		struct amount_msat fee;
+
+		error_message = reduce_num_flows(rq, rq, flows, amount, rq->maxparts);
+		if (error_message) {
+			*flows = tal_free(*flows);
+			return error_message;
+		}
+
+		/* Check fee budget! */
+		fee = flowset_fee(rq->plugin, *flows);
+		if (amount_msat_greater(fee, maxfee)) {
+			error_message = rq_log(rq, rq, LOG_INFORM,
+					       "After reducing the flows to %zu (i.e. maxparts),"
+					       " we had a fee of %s, greater than "
+					       "max of %s.",
+					       tal_count(*flows),
+					       fmt_amount_msat(tmpctx, fee),
+					       fmt_amount_msat(tmpctx, maxfee));
+			if (error_message) {
+				*flows = tal_free(*flows);
+				return error_message;
+			}
+		}
+	}
+
 	/* flows_probability re-does a temporary reservation so we need to call
 	 * it after we have cleaned the reservations we used to build the flows
 	 * hence after we freed working_ctx. */
diff --git a/plugins/askrene/refine.c b/plugins/askrene/refine.c
index 052d125bf..2babc216a 100644
--- a/plugins/askrene/refine.c
+++ b/plugins/askrene/refine.c
@@ -1,5 +1,6 @@
 #include "config.h"
 #include <ccan/asort/asort.h>
+#include <ccan/cast/cast.h>
 #include <ccan/tal/str/str.h>
 #include <common/gossmap.h>
 #include <plugins/askrene/askrene.h>
@@ -597,29 +598,43 @@ double flows_probability(const tal_t *ctx, struct route_query *rq,
 	return probability;
 }
 
-/* Compare flows by deliver amount */
-static int reverse_cmp_flows(struct flow *const *fa, struct flow *const *fb,
-			     void *unused UNUSED)
+const char *reduce_num_flows(const tal_t *ctx,
+			     const struct route_query *rq,
+			     struct flow ***flows,
+			     struct amount_msat deliver,
+			     size_t num_parts)
 {
-	if (amount_msat_eq((*fa)->delivers, (*fb)->delivers))
-		return 0;
-	if (amount_msat_greater((*fa)->delivers, (*fb)->delivers))
-		return -1;
-	return 1;
-}
+	/* Keep the largest flows (not as I originally implemented, the largest
+	 * capacity flows).  Here's Lagrang3's analysis:
+	 *
+	 * I think it is better to keep the largest-deliver flows.  If we only
+	 * go for the highest capacity we may throw away the low cost benefits
+	 * of the MCF.
 
-bool remove_flows(struct flow ***flows, u32 n)
-{
-	if (n == 0)
-		goto fail;
-	if (n > tal_count(*flows))
-		goto fail;
-	asort(*flows, tal_count(*flows), reverse_cmp_flows, NULL);
-	for (size_t count = tal_count(*flows); n > 0; n--, count--) {
-		assert(count > 0);
-		tal_arr_remove(flows, count - 1);
-	}
-	return true;
-fail:
-	return false;
+	 * Hypothetical scenario: MCF finds 3 flows but maxparts=2,
+	 * flow 1: deliver=10, cost=0, capacity=0
+	 * flow 2: deliver=7, cost=1, capacity=5
+	 * flow 3: deliver=1, cost=10, capacity=100
+	 *
+	 * It is better to keep flows 1 and 2 by accomodating 1 more unit of
+	 * flow in flow2 at 1 value expense (per flow), than to keep flows 2 and
+	 * 3 by accomodating 5 more units of flow in flow2 at cost 1 and 5 in
+	 * flow3 at cost 100.
+	 *
+	 * The trade-off is: if we prioritize the delivery value already
+	 * computed by MCF then we find better solutions, but we might fail to
+	 * find feasible solutions sometimes. If we prioritize capacity then we
+	 * generally find bad solutions though we find feasibility more often
+	 * than the alternative.
+	 */
+	size_t orig_num_flows = tal_count(*flows);
+	asort(*flows, orig_num_flows, revcmp_flows, NULL);
+	tal_resize(flows, num_parts);
+
+	if (!increase_flows(rq, *flows, deliver, -1.0))
+		return rq_log(ctx, rq, LOG_INFORM,
+			      "Failed to reduce %zu flows down to maxparts (%zu)",
+			      orig_num_flows, num_parts);
+
+	return NULL;
 }
diff --git a/plugins/askrene/refine.h b/plugins/askrene/refine.h
index fcd64be21..eeed8f28e 100644
--- a/plugins/askrene/refine.h
+++ b/plugins/askrene/refine.h
@@ -38,7 +38,10 @@ void squash_flows(const tal_t *ctx, struct route_query *rq,
 double flows_probability(const tal_t *ctx, struct route_query *rq,
 			 struct flow ***flows);
 
-/* Helper function: removes n flows from the set. It will remove those flows
- * with the lowest amount values. */
-bool remove_flows(struct flow ***flows, u32 n);
+/* Modify flows so only N remain, if we can.  Returns an error if we cannot. */
+const char *reduce_num_flows(const tal_t *ctx,
+			     const struct route_query *rq,
+			     struct flow ***flows,
+			     struct amount_msat deliver,
+			     size_t num_parts);
 #endif /* LIGHTNING_PLUGINS_ASKRENE_REFINE_H */
diff --git a/tests/test_askrene.py b/tests/test_askrene.py
index 59e0eb872..08b870547 100644
--- a/tests/test_askrene.py
+++ b/tests/test_askrene.py
@@ -1782,8 +1782,7 @@ def test_simple_dummy_channel(node_factory):
 
 def test_maxparts_infloop(node_factory, bitcoind):
     # Three paths from l1 -> l5.
-    # FIXME: enhance explain_failure!
-    l1, l2, l3, l4, l5 = node_factory.get_nodes(5, opts=[{'broken_log': 'plugin-cln-askrene.*the obvious route'}] + [{}] * 4)
+    l1, l2, l3, l4, l5 = node_factory.get_nodes(5)
 
     for intermediate in (l2, l3, l4):
         node_factory.join_nodes([l1, intermediate, l5])
@@ -1805,16 +1804,14 @@ def test_maxparts_infloop(node_factory, bitcoind):
                              final_cltv=5)
     assert len(route['routes']) == 3
 
-    # Now with maxparts == 2.  Usually askrene can't figure out why it failed,
-    # but sometimes it gets a theory.
-    with pytest.raises(RpcError):
-        l1.rpc.getroutes(source=l1.info['id'],
-                         destination=l5.info['id'],
-                         amount_msat=amount,
-                         layers=[],
-                         maxfee_msat=amount,
-                         final_cltv=5,
-                         maxparts=2)
+    route = l1.rpc.getroutes(source=l1.info['id'],
+                             destination=l5.info['id'],
+                             amount_msat=amount,
+                             layers=[],
+                             maxfee_msat=amount,
+                             final_cltv=5,
+                             maxparts=2)
+    assert len(route['routes']) == 2
 
 
 def test_askrene_timeout(node_factory, bitcoind):