<?xml version="1.0" encoding="US-ASCII"?>
<!-- This template is for creating an Internet Draft using xml2rfc,
    which is available here: http://xml.resource.org. -->
<!DOCTYPE rfc SYSTEM "rfc2629.dtd" [
<!-- One method to get references from the online citation libraries.
    There has to be one entity for each item to be referenced. 
    An alternate method (rfc include) is described in the references. -->
<!ENTITY RFC2119 SYSTEM "http://xml.resource.org/public/rfc/bibxml/reference.RFC.2119.xml">
<!ENTITY RFC2629 SYSTEM "http://xml.resource.org/public/rfc/bibxml/reference.RFC.2629.xml">
<!ENTITY RFC3552 SYSTEM "http://xml.resource.org/public/rfc/bibxml/reference.RFC.3552.xml">
<!ENTITY I-D.narten-iana-considerations-rfc2434bis SYSTEM "http://xml.resource.org/public/rfc/bibxml3/reference.I-D.narten-iana-considerations-rfc2434bis.xml">
]>
<?xml-stylesheet type='text/xsl' href='rfc2629.xslt' ?>
<!-- used by XSLT processors -->
<!-- For a complete list and description of processing instructions (PIs), 
    please see http://xml.resource.org/authoring/README.html. -->
<!-- Below are generally applicable Processing Instructions (PIs) that most I-Ds might want to use.
    (Here they are set differently than their defaults in xml2rfc v1.32) -->
<?rfc strict="yes" ?>
<!-- give errors regarding ID-nits and DTD validation -->
<!-- control the table of contents (ToC) -->
<?rfc toc="yes"?>
<!-- generate a ToC -->
<?rfc tocdepth="4"?>
<!-- the number of levels of subsections in ToC. default: 3 -->
<!-- control references -->
<?rfc symrefs="yes"?>
<!-- use symbolic reference tags, i.e., [RFC2119] instead of [1] -->
<?rfc sortrefs="yes" ?>
<!-- sort the reference entries alphabetically -->
<!-- control vertical white space 
    (using these PIs as follows is recommended by the RFC Editor) -->
<?rfc compact="yes" ?>
<!-- do not start each main section on a new page -->
<?rfc subcompact="no" ?>
<!-- keep one blank line between list items -->
<!-- end of list of popular I-D processing instructions -->
<rfc category="std" docName="draft-xu-idr-fare-04" ipr="trust200902">
  <front>
    <title abbrev="FARE using BGP">Fully Adaptive Routing Ethernet using
    BGP</title>

    <author fullname="Xiaohu Xu" initials="X." surname="Xu">
      <organization>China Mobile</organization>

      <address>
        <email>xuxiaohu_ietf@hotmail.com</email>
      </address>
    </author>

    <author fullname="Shraddha Hegde" initials="S." surname="Hegde">
      <organization>Juniper</organization>

      <address>
        <postal>
          <street/>

          <city/>

          <region/>

          <code/>

          <country/>
        </postal>

        <phone/>

        <facsimile/>

        <email>shraddha@juniper.net</email>

        <uri/>
      </address>
    </author>

    <author fullname="Keyur Patel" initials="K." surname="Patel">
      <organization>Arrcus</organization>

      <address>
        <postal>
          <street/>

          <city/>

          <region/>

          <code/>

          <country/>
        </postal>

        <phone/>

        <facsimile/>

        <email>keyur@arrcus.com</email>

        <uri/>
      </address>
    </author>

    <author fullname="Zongying He" initials="Z." surname="He">
      <organization>Broadcom</organization>

      <address>
        <email>zongying.he@broadcom.com</email>
      </address>
    </author>

    <author fullname="Junjie Wang" initials="J." surname="Wang">
      <organization>Centec</organization>

      <address>
        <email>wangjj@centec.com</email>
      </address>
    </author>

    <author fullname="Hongyi Huang" initials="H." surname="Huang">
      <organization>Huawei</organization>

      <address>
        <email>hongyi.huang@huawei.com</email>
      </address>
    </author>

    <author fullname="Qingliang Zhang" initials="Q." surname="Zhang">
      <organization>H3C</organization>

      <address>
        <email>zhangqingliang@h3c.com</email>
      </address>
    </author>

    <author fullname="Hang Wu" initials="H." surname="Wu">
      <organization>Ruijie Networks</organization>

      <address>
        <email>wuhang@ruijie.com.cn</email>
      </address>
    </author>

    <author fullname="Yadong Liu" initials="Y." surname="Liu">
      <organization>Tencent</organization>

      <address>
        <email>zeepliu@tencent.com</email>
      </address>
    </author>

    <author fullname="Yinben Xia" initials="Y." surname="Xia">
      <organization>Tencent</organization>

      <address>
        <email>forestxia@tencent.com</email>
      </address>
    </author>

    <author fullname="Peilong Wang" initials="P." surname="Wang">
      <organization>Baidu</organization>

      <address>
        <email>wangpeilong01@baidu.com</email>
      </address>
    </author>

    <author fullname="Tiezheng Li" initials="T." surname="Li">
      <organization>IEIT SYSTEMS</organization>

      <address>
        <email>litiezheng@ieisystem.com</email>
      </address>
    </author>

    <!--

-->

    <date day="18" month="December" year="2025"/>

    <abstract>
      <t>Large language models (LLMs) like ChatGPT have become increasingly
      popular in recent years due to their impressive performance in various
      natural language processing tasks. These models are built by training
      deep neural networks on massive amounts of text data, as well as visual
      and video data, and often consist of billions or even trillions of
      parameters. However, the training process for these models can be
      extremely resource-intensive, requiring the deployment of thousands or
      even tens of thousands of GPUs in a single AI training cluster.
      Therefore, three-stage or even five-stage CLOS networks are commonly
      adopted for AI networks. The non-blocking nature of the network becomes
      increasingly critical for large-scale AI model training. Therefore,
      adaptive routing is necessary to dynamically distribute traffic to the
      same destination across multiple equal-cost paths, based on network
      capacity and even congestion information along those paths.</t>
    </abstract>

    <note title="Requirements Language">
      <t>The key words "MUST", "MUST NOT", "REQUIRED", "SHALL", "SHALL NOT",
      "SHOULD", "SHOULD NOT", "RECOMMENDED", "MAY", and "OPTIONAL" in this
      document are to be interpreted as described in <xref
      target="RFC2119">RFC 2119</xref>.</t>
    </note>
  </front>

  <middle>
    <section title="Introduction">
      <t>Large language models (LLMs) like ChatGPT have become increasingly
      popular in recent years due to their impressive performance in various
      natural language processing tasks. These models are built by training
      deep neural networks on massive amounts of text data, as well as visual
      and video data, and often consist of billions or even trillions of
      parameters. However, the training process for these models can be
      extremely resource-intensive, requiring the deployment of thousands or
      even tens of thousands of GPUs in a single AI training cluster.
      Therefore, three-stage or even five-stage CLOS networks are commonly
      adopted for AI networks. Furthermore, in rail-optimized CLOS network
      topologies with standard GPU servers (an HB domain of eight GPUs), the
      Nth GPUs of each server in a group of servers are connected to the Nth
      leaf switch, which provides higher bandwidth and non-blocking
      connectivity between the GPUs in the same rail. In a rail-optimized
      network topology, most traffic between GPU servers traverses the
      intra-rail networks rather than the inter-rail networks. In addition,
      whether in rail-optimized or rail-free networks, collective
      communication job schedulers always opt to schedule jobs with network
      topology awareness to minimize the amount of traffic going to the upper
      layers of the network.</t>

      <t>The non-blocking nature of the network, particularly at the lower
      layers, is essential for large-scale AI training clusters. AI workloads
      are usually very bandwidth-hungry and often generate several large data
      flows simultaneously. If traditional hash-based ECMP load balancing is
      used without optimization, it can lead to serious congestion and high
      latency in the network when multiple large data flows are directed to
      the same link. This congestion can result in longer-than-expected model
      training times, as job completion time depends on worst-case
      performance. Therefore, adaptive routing is necessary to dynamically
      distribute traffic to the same destination across multiple equal-cost
      paths, taking into account network capacity and even congestion along
      these paths. In essence, adaptive routing is a capacity- and even
      congestion-aware dynamic path selection algorithm that optimizes traffic
      distribution for high-throughput AI training workloads.</t>

      <t>Furthermore, to minimize congestion risk to the greatest extent, the
      routing mechanism should be designed with finer granularity whenever
      possible. Flow-granular adaptive routing still carries a statistical
      probability of congestion. Therefore, packet-granular adaptive routing
      is more desirable, even though packet spraying would introduce
      out-of-order packet delivery issues. A flexible reordering mechanism
      must be implemented (e.g., at egress Top-of-Rack (ToR) switches or
      receiving servers). Recent optimizations for RDMA over Converged
      Ethernet (RoCE) and newly developed transport protocols as RoCE
      alternatives no longer require out-of-order delivery handling at the
      network layer; instead, this function is offloaded to the message
      processing layer. </t>

      <t>To enable adaptive routing&mdash;whether flow-granular or
      packet-granular&mdash;it is necessary to propagate network topology
      information (including link capacity and path capacity) across the CLOS
      network. Therefore, it is straightforward to adopt link-state protocols
      such as OSPF or IS-IS as the underlay routing protocol in CLOS networks,
      rather than BGP. How to leverage OSPF or IS-IS to achieve adaptive
      routing has been described in <xref target="I-D.xu-lsr-fare"/>. However,
      some data center network operators are accustomed to using BGP as
      the underlay routing protocol of data center networks <xref
      target="RFC7938"/>. Therefore, there is a need to leverage BGP
      to achieve adaptive routing as well.</t>

      <t>Hence, this document defines a new extended community referred to as
      Path Bandwidth Extended Community, and describes how to use this
      extended community to carry end-to-end path bandwidth within the data
      center fabric so as to achieve adaptive routing. </t>

      <t>Note that while adaptive routing, especially at the packet-granular
      level, can help reduce congestion between switches in the network,
      thereby achieving a non-blocking fabric, it does not address the incast
      congestion issue which is commonly experienced in last-hop switches that
      are connected to the receivers in many-to-one communication patterns.
      Therefore, a congestion control mechanism is always necessary between
      the sending and receiving servers to mitigate such congestion.</t>

      <section title="Comparison with Related Works">
        <t><xref target="I-D.ietf-idr-link-bandwidth"/> outlines a method for
        implementing weighted ECMP load-balancing based on the bandwidth of
        the EXTERNAL (DMZ) link, which is conveyed in the non-transitive link
        bandwidth extended community. However, it is not feasible to enable
        adaptive routing directly using the non-transitive link bandwidth
        extended community due to the following constraints mentioned in <xref
        target="I-D.ietf-idr-link-bandwidth"/>. "No more than one link
        bandwidth extended community SHALL be attached to a route.
        Additionally, if a route is received with a link bandwidth extended
        community and the BGP speaker sets itself as next-hop while announcing
        that route to other peers, the link bandwidth extended community
        should be removed. The extended community is optional
        non-transitive."</t>

        <t><xref target="I-D.ietf-bess-ebgp-dmz"/> removes the previous
        restriction that the EXTERNAL (DMZ) link bandwidth extended community
        could not be sent across AS boundaries. Additionally, when receiving
        multiple equal-cost BGP paths towards the external network (e.g., the
        WAN), the best path among them will be advertised to eBGP peers with
        the transitive link bandwidth extended community filled with the
        cumulative bandwidth of the multiple external links. Since the
        approach described in that document is based on the assumption that
        "The total BW available towards WAN is significantly lower than the
        total BW within the fabric," the internal path bandwidth within
        the fabric is not taken into account when performing weighted ECMP
        load-balancing.</t>

        <t><xref target="I-D.ietf-bess-evpn-unequal-lb"/> describes an
        EVPN-dedicated extended community and an EVPN link-bandwidth sub-type
        of the above EVPN-dedicated extended community for EVPN weighted ECMP
        load-balancing. Additionally, the document defines different ways to
        express the link bandwidth.</t>

        <t>The three previous documents explain how to use the extended
        community to carry the bandwidth of the external links towards the
        outside of the fabric (such as the WAN, services bound to an anycast
        address, or multi-homed VPN sites) for weighted ECMP load-balancing. In
        contrast, this document explains how to use the extended community to
        carry the end-to-end path bandwidth within the data center fabric for
        weighted ECMP load-balancing.</t>
      </section>
    </section>

    <section anchor="Abbreviations_Terminology" title="Terminology">
      <t>This memo makes use of the terms defined in <xref
      target="RFC4360"/>.</t>
    </section>

    <section title="Path Bandwidth Extended Community">
      <t>The Path Bandwidth Extended Community is used to indicate the minimum
      bandwidth of the path towards the destination. It is a new IPv4 Address
      Specific Extended Community that can be transitive or
      non-transitive.</t>

      <t>The value of the high-order octet of this extended type is either
      0x01 or 0x41. The low-order octet of this extended type is TBD.</t>

      <t>The Value field consists of two sub-fields:</t>

      <t><list>
          <t>Global Administrator sub-field: This sub-field contains the
          router ID of the advertising router that appends the path bandwidth
          extended community or updates the path bandwidth value of the
          existing path bandwidth extended community.</t>

          <t>Local Administrator sub-field: This sub-field contains the path
          bandwidth value in IEEE floating point format with units of
          Gigabytes per second (GB/s).</t>
        </list></t>
    </section>

    <section title="Solution Description">
      <t/>

      <section title="Adaptive Routing in 3-stage CLOS">
        <t><figure>
            <artwork align="center"><![CDATA[      
   +----+ +----+ +----+ +----+  
   | S1 | | S2 | | S3 | | S4 |  (Spine)
   +----+ +----+ +----+ +----+             
             
   +----+ +----+ +----+ +----+ +----+ +----+ +----+ +----+
   | L1 | | L2 | | L3 | | L4 | | L5 | | L6 | | L7 | | L8 |  (Leaf)
   +----+ +----+ +----+ +----+ +----+ +----+ +----+ +----+ 


                              Figure 1]]></artwork>
          </figure></t>

        <t>(Note that the diagram above does not include the connections
        between nodes. However, it can be assumed that leaf nodes are
        connected to every spine node in the above CLOS topology.)</t>

        <t>In a three-stage CLOS network as shown in Figure 1, also known as a
        leaf-spine network, each leaf node would establish eBGP sessions with
        all spine nodes.</t>

        <t>All nodes are enabled for adaptive routing.</t>

        <t>When a leaf node, such as L1, advertises the route to a specific IP
        prefix that it originates, it will attach a transitive path bandwidth
        extended community filled with a maximum bandwidth value.</t>

        <t>Upon receiving the above advertisement, a spine node, such as S1,
        SHOULD determine the minimum value between the bandwidth of the link
        towards the advertising node (e.g., L1) and the value of the path
        bandwidth extended community carried in the received route, and then
        update the path bandwidth extended community with the above minimum
        value before readvertising that route to remote eBGP peers. Once S1
        receives multiple equal-cost routes for a given prefix from multiple
        leaf nodes (e.g., L1 and L2 in the server multi-homing scenario), for
        each route, it SHOULD determine the minimum value between the
        bandwidth of the link towards the advertising node and the value of
        the path bandwidth extended community carried in the received route,
        and then use that minimum bandwidth value as a weight value for that
        route when performing weighted ECMP load-balancing. When readvertising
        the route for that prefix to remote eBGP peers, the path bandwidth
        extended community would be updated with the sum of the minimum
        bandwidth values of those routes.</t>

        <t>When a leaf node, such as L8, receives multiple equal-cost routes
        for that prefix from spine nodes (e.g., S1, S2, S3 and S4), for each
        route, it will determine the minimum value between the bandwidth of
        the link towards the advertising node and the value of the path
        bandwidth extended community carried in the received route, and then
        use that minimum bandwidth value as a weight value for that route when
        performing weighted ECMP load-balancing. Note that when a given IP
        prefix is multi-homed to multiple leaf nodes (e.g., two leaf nodes),
        the value of the path bandwidth extended community carried in the
        received route SHOULD be divided by the number of multi-homed leaf
        nodes (e.g., two) before determining the minimum value. Alternatively,
        the bandwidth of the link towards the advertising node SHOULD be
        multiplied by the number of multi-homed leaf nodes (e.g., two) before
        determining the minimum value.</t>

        <t>Note that weighted ECMP load-balancing according to path bandwidth
        SHOULD NOT be performed unless all equal-cost routes for a given
        prefix carry a path bandwidth extended community.</t>
      </section>

      <section title="Adaptive Routing in 5-stage CLOS">
        <t><figure>
            <artwork align="center"><![CDATA[      
   =========================================         
   # +----+ +----+ +----+ +----+           #
   # | L1 | | L2 | | L3 | | L4 | (Leaf)    #
   # +----+ +----+ +----+ +----+           #
   #                                PoD-1  #
   # +----+ +----+ +----+ +----+           #
   # | S1 | | S2 | | S3 | | S4 | (Spine)   #
   # +----+ +----+ +----+ +----+           #
   =========================================

   ===============================     ===============================
   # +----+ +----+ +----+ +----+ #     # +----+ +----+ +----+ +----+ #
   # |SS1 | |SS2 | |SS3 | |SS4 | #     # |SS1 | |SS2 | |SS3 | |SS4 | #
   # +----+ +----+ +----+ +----+ #     # +----+ +----+ +----+ +----+ #
   #   (Super-Spine@Plane-1)     #     #   (Super-Spine@Plane-4)     #
   =============================== ... ===============================

   =========================================         
   # +----+ +----+ +----+ +----+           #
   # | S1 | | S2 | | S3 | | S4 | (Spine)   #
   # +----+ +----+ +----+ +----+           #
   #                                PoD-8  #
   # +----+ +----+ +----+ +----+           #
   # | L1 | | L2 | | L3 | | L4 | (Leaf)    #
   # +----+ +----+ +----+ +----+           #
   =========================================           

                              Figure 2]]></artwork>
          </figure>(Note that the diagram above does not include the
        connections between nodes. However, it can be assumed that the leaf
        nodes in a given PoD are connected to every spine node in that PoD.
        Similarly, each spine node (e.g., S1) is connected to all super-spine
        nodes in the corresponding PoD-interconnect plane (e.g.,
        Plane-1).)</t>

        <t>For a five-stage CLOS network as illustrated in Figure 2, each leaf
        node would establish eBGP sessions with all spine nodes of the same
        PoD while each spine node would establish eBGP sessions with all
        super-spine nodes in the corresponding PoD-interconnect plane.</t>

        <t>When a given leaf node, such as L1@PoD-1, advertises the route for
        a specific IP prefix that it originates, it will attach a transitive
        path bandwidth extended community filled with a maximum bandwidth
        value.</t>

        <t>Upon receiving the above route advertisement, a spine node, such as
        S1@PoD-1, will determine the minimum value between the bandwidth of
        the link towards the advertising node (e.g., L1@PoD-1) and the value
        of the path bandwidth extended community carried in the route, and
        then update the path bandwidth extended community with the above
        minimum value before advertising that route to its peers. Once
        S1@PoD-1 receives multiple equal-cost routes for a given prefix from
        multiple leaf nodes (e.g., L1 and L2@PoD-1 in the server multi-homing
        scenario), for each route, it will determine the minimum value between
        the bandwidth of the link towards the advertising node and the value
        of the path bandwidth extended community carried in the route, and
        then use that minimum bandwidth value as a weight value for that route
        when performing weighted ECMP load-balancing. When advertising the
        route for that prefix further to remote peers, the path bandwidth
        extended community would be updated with the sum of the bandwidth
        values of the received routes.</t>

        <t>When a given super-spine node, such as SS1@Plane-1, receives the
        above route advertised from S1@PoD-1, it will not update the
        transitive path bandwidth extended community when advertising that
        route to its peers. Additionally, it MAY attach another
        path bandwidth extended community which is non-transitive to indicate
        the bandwidth of the link towards the advertising router of the
        received route (i.e., S1@PoD-1).</t>

        <t>When a given spine node in another PoD, such as S1@PoD-8, receives
        multiple equal-cost routes for a given prefix from super-spine nodes
        in Plane-1 (e.g., SS1, SS2, SS3 and SS4@Plane-1), once each route
        contains a non-transitive path bandwidth extended community, for each
        route, it will determine the minimum value between the bandwidth of
        the link towards the advertising node and the bandwidth value of the
        non-transitive path bandwidth extended community carried in the route,
        and then use that minimum value as a weight value for that route when
        performing weighted ECMP load-balancing. Otherwise, it would perform
        ECMP load-balancing by default.</t>

        <t>When advertising that route to its peers, it will not update the
        value of the transitive path bandwidth extended community by default
        (note that the transitive path bandwidth extended communities of those
        multiple equal-cost routes carry the same value that was set by
        S1@PoD-1). In the case where each route contains a non-transitive path
        bandwidth extended community, the above spine node MAY
        update the value of the transitive path bandwidth extended community
        with the total bandwidth value of all paths towards the next-next hop
        (e.g., the paths towards S1@PoD-1 via SS1, SS2, SS3 and SS4@Plane-1)
        if the latter is smaller than the former.</t>

        <t>When a given leaf node in PoD-8, such as L1@PoD-8, receives
        multiple equal-cost routes for that prefix from multiple spine nodes
        (e.g., S1, S2, S3 and S4@PoD-8), for each route, it will determine the
        minimum value between the bandwidth of the link towards the
        advertising node and the value of the path bandwidth extended
        community carried in the route, and then use that minimum value as a
        weight value for that route when performing weighted ECMP
        load-balancing. Note that when a given IP prefix is multi-homed to
        multiple leaf nodes (e.g., two leaf nodes), the value of the path
        bandwidth extended community carried in the received route SHOULD be
        divided by the number of multi-homed leaf nodes (e.g., two) before
        determining the minimum value. Alternatively, the bandwidth of the
        link towards the advertising node SHOULD be multiplied by the number
        of multi-homed leaf nodes (e.g., two) before determining the minimum
        value.</t>

        <t>Note that weighted ECMP load-balancing according to path bandwidth
        SHOULD NOT be performed unless all equal-cost routes for a given
        prefix carry a path bandwidth extended community.</t>
      </section>
    </section>

    <section anchor="Acknowledgements" title="Acknowledgements">
      <t>TBD.</t>

      <!---->
    </section>

    <section anchor="IANA" title="IANA Considerations">
      <t>TBD.</t>
    </section>

    <section anchor="Security" title="Security Considerations">
      <t>TBD.</t>

      <!---->
    </section>
  </middle>

  <back>
    <references title="Normative References">
      <?rfc include='reference.RFC.2119'?>

      <?rfc include='reference.RFC.4360'?>

      <!---->
    </references>

    <references title="Informative References">
      <?rfc include='reference.RFC.7938'?>

      <?rfc include="reference.I-D.xu-lsr-fare"?>

      <?rfc include="reference.I-D.ietf-idr-link-bandwidth"?>

      <?rfc include="reference.I-D.ietf-bess-ebgp-dmz"?>

      <?rfc include="reference.I-D.ietf-bess-evpn-unequal-lb"?>

      <!---->
    </references>
  </back>
</rfc>
