<?xml version="1.0" encoding="utf-8"?>
<!-- 
     draft-rfcxml-general-template-annotated-00
  
     This template includes examples of most of the features of RFCXML with comments explaining 
     how to customise them, and examples of how to achieve specific formatting.
     
     Documentation is at https://authors.ietf.org/en/templates-and-schemas
-->
<?xml-model href="rfc7991bis.rnc"?>  <!-- Required for schema validation and schema-aware editing -->
<!-- <?xml-stylesheet type="text/xsl" href="rfc2629.xslt" ?> --> 
<!-- This third-party XSLT can be enabled for direct transformations in XML processors, including most browsers -->

<!DOCTYPE rfc [
  <!ENTITY nbsp    "&#160;">
  <!ENTITY zwsp   "&#8203;">
  <!ENTITY nbhy   "&#8209;">
  <!ENTITY wj     "&#8288;">
]>
<!-- If further character entities are required then they should be added to the DOCTYPE above.
     Use of an external entity file is not recommended. -->

<rfc
  xmlns:xi="http://www.w3.org/2001/XInclude"
  category="info"
  docName="draft-agt-rtgwg-dragonfly-routing-01"
  ipr="trust200902"
  obsoletes=""
  updates=""
  submissionType="IETF"
  xml:lang="en"
  version="3">
<!-- 
    * docName should be the name of your draft
    * category should be one of std, bcp, info, exp, historic
    * ipr should be one of trust200902, noModificationTrust200902, noDerivativesTrust200902, pre5378Trust200902
    * updates can be an RFC number as NNNN
    * obsoletes can be an RFC number as NNNN 
-->

  <front>
    <title>Routing in Dragonfly+ Topologies</title> <!-- https://authors.ietf.org/en/rfcxml-vocabulary#title-4 -->
    <!--  The abbreviated title is required if the full title is longer than 39 characters -->

    <seriesInfo name="Internet-Draft" value="draft-agt-rtgwg-dragonfly-routing-01"/> <!-- https://authors.ietf.org/en/rfcxml-vocabulary#seriesinfo -->
    <!-- Set value to the name of the draft  -->

    <author fullname="Dmitry Afanasiev" initials="D." surname="Afanasiev"> <!-- https://authors.ietf.org/en/rfcxml-vocabulary#author -->
    <!-- initials should not include an initial for the surname -->
    <!-- role="editor" is optional -->
    <!-- Can have more than one author --> 
    <!-- all of the following elements are optional -->
      <address> <!-- https://authors.ietf.org/en/rfcxml-vocabulary#address -->
        <email>dmitry.afanasiev@gmail.com</email>  
      </address>
    </author>

    <author fullname="Roman Glebov" initials="R." surname="Glebov"> <!-- https://authors.ietf.org/en/rfcxml-vocabulary#author -->
      <organization>Yandex</organization> <!-- https://authors.ietf.org/en/rfcxml-vocabulary#organization -->
      <address> <!-- https://authors.ietf.org/en/rfcxml-vocabulary#address -->
        <email>kitaro630@yandex.ru</email>  
      </address>
    </author>  

    <author fullname="Jeff Tantsura" initials="J." surname="Tantsura"> <!-- https://authors.ietf.org/en/rfcxml-vocabulary#author -->
      <organization>Nvidia</organization> <!-- https://authors.ietf.org/en/rfcxml-vocabulary#organization -->
      <address> <!-- https://authors.ietf.org/en/rfcxml-vocabulary#address -->
        <email>jefftant.ietf@gmail.com</email>  
      </address>
    </author>  


    <date year="2024" month="3" day="4"/> <!-- https://authors.ietf.org/en/rfcxml-vocabulary#date -->
    <!-- On draft submission:
         * If only the current year is specified, the current day and month will be used.
         * If the month and year are both specified and are the current ones, the current day will
           be used
         * If the year is not the current one, it is necessary to specify at least a month and day="1" will be used.
    -->

    <area>Routing</area>
    <workgroup>Routing Area Working Group</workgroup>
    <!-- "Internet Engineering Task Force" is fine for individual submissions.  If this element is 
          not present, the default is "Network Working Group", which is used by the RFC Editor as 
          a nod to the history of the RFC Series. -->
    
    <keyword>dragonfly</keyword>
    <!-- Multiple keywords are allowed.  Keywords are incorporated into HTML output files for 
         use by search engines. -->

    <abstract>
      <t>This document provides an overview of the Dragonfly+ network topology and describes a routing implementation for IP networks with Dragonfly+ topology that supports non-minimal routing.</t>
    </abstract>
 
  </front>

  <middle>
    
    <section>
    <!-- The default attributes for <section> are numbered="true" and toc="default" -->
      <name>Introduction</name>
      <t>Dragonfly <xref target="KIM2008"/> is a highly scalable, low-diameter, cost-efficient network topology that provides high bandwidth and large path diversity. The Dragonfly topology was originally designed for HPC and supercomputing systems and is now adopted in more and more supercomputing networks. Its properties also make it an interesting candidate for data center network topology, especially the Dragonfly+ variant <xref target="SPHINER2017"/> with leaf-spine intra-group topology. However, building IP networks with Dragonfly+ topology is a non-trivial problem because IP networks lack many mechanisms traditionally available in HPC interconnection networks. Specifically, Dragonfly+ relies heavily on non-minimal routing and adaptive load balancing for efficient use of available network capacity.</t>
      
      <section anchor="requirements">
      <!-- anchor is an optional attribute -->
        <name>Requirements Language</name>
        <t>The key words "MUST", "MUST NOT", "REQUIRED", "SHALL",
          "SHALL NOT", "SHOULD", "SHOULD NOT", "RECOMMENDED", "NOT
          RECOMMENDED", "MAY", and "OPTIONAL" in this document are to be
          interpreted as described in BCP 14 <xref target="RFC2119"/>
          <xref target="RFC8174"/> when, and only when, they appear in
          all capitals, as shown here.</t>
      </section>
      <!-- The 'Requirements Language' section is optional -->
    </section>
    
    <section>
      <name>Terminology</name>
      <t>This section introduces the terminology used in this document.</t>
      <dl newline="true" spacing="normal">
          <dt>Group</dt>
          <dd>Building block of a Dragonfly network: a collection of nodes connected by local links. In practical deployments, routers and associated endpoints belonging to a group are assumed to be compactly colocated.</dd>
          <dt>Local (L) / intra-group link</dt>
          <dd>Link between routers in the same group. In Dragonfly+, a group is a leaf-spine network (bipartite graph), so local links are always between a leaf and a spine.</dd>
          <dt>Global (G) / inter-group link</dt>
          <dd>Link between routers in different groups. Global links are usually longer and more expensive than local links, so it is desirable to minimize their number.</dd>
          <dt>Path signature</dt>
          <dd>Sequence of letters corresponding to types of links in the path, e.g. LGLLGL.</dd>
          <dt>Local / intra-group network</dt>
          <dd>Network formed by the routers of a single group and the local links between them.</dd>
          <dt>Global / inter-group network</dt>
          <dd>Network formed by the global links that interconnect groups.</dd>
          <dt>MIN</dt>
          <dd>Minimal routing</dd>
          <dt>VAL</dt>
          <dd>Randomized non-minimal routing (valiant load balanced)</dd>
          <dt>AR</dt>
          <dd>Adaptive routing. The name is misleading because AR has nothing to do with disseminating reachability information - it is a mechanism that maps traffic onto already known paths.</dd>
          <dt>UGAL</dt>
          <dd>Universal Globally-Adaptive Load-balanced routing</dd>
          <dt>UGAL-L</dt>
          <dd>UGAL using local queue information at the current router node</dd>
          <dt>UGAL-G</dt>
          <dd>UGAL using global information</dd>
          <dt>ARN</dt>
          <dd>Adaptive Routing Notification</dd>
      </dl>
    </section>

    <section>
      <name>Network Design Requirements</name>
      <t>Network design requirements are largely the same as in <xref target="RFC7938"/>. The most notable difference is the extensive use of non-minimal paths.</t>
    </section>


    <section>
      <name>Dragonfly Topology</name>
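      <t>This section gives an overview of the Dragonfly and Dragonfly+ topologies, the paths available in them, topology construction, and adaptive load balancing.</t>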

        <section>
        <name>Dragonfly Topology Overview</name>
        <t>Dragonfly topology was introduced by Kim et al. <xref target="KIM2008"/>. It aims to decrease the cost and diameter of the network while providing good scalability. Dragonfly is a hierarchical topology that divides routers into groups connected by long (inter-group) links in a fully connected global network. Each group essentially implements a high-radix virtual router. Dragonfly is a direct topology, in which every router has a set of terminal connections leading to endpoints, and a set of topological connections leading to other routers, some from the same group and some from other groups. While the original Dragonfly uses a fully connected intra-group topology, it does not prevent the use of other intra-group topologies. Different intra-group topologies produce different Dragonfly "flavors". Inter-group topology is always fully connected.</t>
        <t>Dragonfly+ as proposed in <xref target="SPHINER2017"/> relies on an extended group topology in which intra-group routers are connected as a bipartite graph (leaf-spine or Clos-like topology). Dragonfly+ is superior to conventional Dragonfly due to the significantly larger number of hosts which it is able to support. In addition, Dragonfly+ supports similar or better bisectional bandwidth for various traffic patterns and requires a smaller number of buffers to avoid credit loop deadlocks in lossless networks. Dragonfly+ is an indirect topology in which only leaf nodes are connected to endpoints.</t>
        <t>TODO: spine sizing.</t>
        </section>

        <section>
        <name>Routing and Paths in Dragonfly+</name>
        <t>In Dragonfly and Dragonfly+ topologies there exists at least one direct global link between every pair of groups. Minimal inter-group routes traverse a single global link. The capacity of minimal routes between each pair of groups is lower than the aggregate link capacity of hosts in a group. Therefore, conventional minimal routing is not enough to obtain maximal throughput and efficiently support various traffic patterns. <xref target="KIM2008"/> introduces the concept of non-minimal adaptive routing.</t>
        <t>For Dragonfly+ we can define three priority levels of inter-group routes. We use the notations "L" and "G" below to indicate whether the route traverses a local or a global link, respectively.</t>
        <ol>
          <li>High priority: Minimal route (LGL) - a shortest distance route which passes through two spine routers using a single global link.</li>
          <li>Medium priority: Intermediate spine route (LGGL) - a route which traverses an intermediate group, using its spine router, passing exactly three spine routers using two global links.</li>
          <li>Low priority: Intermediate leaf route (LGLLGL) - a route which traverses an intermediate group using its two spine routers and a leaf router, passing exactly four spine routers using two global links.</li>
        </ol>
        <t>LGLLGL routes normally appear only when some spines are not connected to at least one spine in every other group - in this case non-minimal routes through an intermediate group might need to use different ingress and egress spines in the intermediate group.</t>
        <t>TODO: discuss imbalance, density and LGLLGL routes <xref target="WILKE2017"/></t>
        </section>

        <section>
        <name>Topology Construction and Graph Wiring</name>
        <t>One possible implementation is described in <xref target="WILKE2017"/>.</t>
        <t>TODO: describe wiring scheme invariant under group rotation (consistent renumbering of all groups by the same offset mod number of groups).</t>
        </section>

        <section>
        <name>Adaptive Load Balancing</name>
        <t>While the routing and forwarding setup described in this document makes it possible to propagate reachability information and install the forwarding state required for Dragonfly+ topologies, including non-minimal paths, it is not enough to use Dragonfly network capacity efficiently, especially in the presence of LGLLGL paths. Efficient mapping of traffic to paths in a Dragonfly network cannot be described by static mechanisms because ideally we would like to:</t>
        <ul>
            <li>fill paths starting from high priority</li>
            <li>try to move flows from congested paths as a possible reaction to congestion</li>
        </ul>
        <t>This requires dynamic adaptive load balancing and coupling between adaptive load balancing and congestion control. Adaptive load balancing MUST be able to work without complete knowledge of network link utilization and queue state, since such state can change significantly over a period of several RTTs, and collecting and distributing global network utilization information often enough in any network of practically interesting size is infeasible.</t>
        <t>Adaptive routing can also work as a complementary failure handling mechanism with much faster reaction time than routing convergence.</t>
        <t>TODO: separate document describing possible adaptive load balancing implementation using existing mechanisms.</t>
        </section>

    </section>


    <section>
      <name>Routing and Forwarding</name>
      <t>This section describes a routing design supporting non-minimal paths. It uses only existing mechanisms - VRFs, route leaking and EBGP as a routing protocol. EBGP is chosen for scalability and flexibility - routing policies and communities make it possible to implement additional logic and precisely control propagation of routing updates.</t>
      <t>The routing design is based on the following principles:</t>
        <ul spacing="normal">
          <li>intra-group traffic MUST use minimal routing, because a group in Dragonfly+ is just a leaf-spine network</li>
          <li>a path can contain at most one transit group</li>
          <li>transit spine(s) MUST use shortest path forwarding to avoid forwarding loops</li>
          <li>LGLLGL paths require traffic reflection via leaves in the transit group but only appear if the number of uplinks per spine is less than the number of remote groups</li>
        </ul>

        <section>
        <name>Forwarding</name>
        <t>To achieve the desired forwarding behavior, several VRFs are configured on every spine:</t>
          <ul spacing="normal">
            <li>local VRF in each group containing local links</li>
            <li>core VRF containing all global links</li>
          </ul>
        <t>An additional VRF serving as a virtual link is configured if the network is using LGLLGL paths - a "reflect" VRF in each group containing local links. Since both the local VRF and the reflect VRF include leaf-spine links, some form of VRF multiplexing over leaf-spine links is required when LGLLGL paths are used.</t>
        <dl newline="true" spacing="normal">
          <dt>Local VRF</dt>
          <dd>imports minimal and non-minimal paths from the core VRF and installs them</dd>
          <dt>Core VRF</dt>
          <dd>
            <ul spacing="normal">
              <li>imports locally originated paths from the local VRF in each group</li>
              <li>imports transit paths from the reflect VRF</li>
            </ul>
          </dd>
          <dt>Reflect VRF</dt>
          <dd>imports minimal paths from the core VRF</dd>
        </dl>
        </section>

        <section>
        <name>Routing</name>
        <t>Each group is in a separate AS.</t>
        <t>Communities, routing policies and update propagation:</t>
        <ul spacing="normal">
          <li>When announcing a route originated in the local group towards other groups, add community C1</li>
          <li>When propagating an announcement with community C1, add community C2</li>
          <li>Do not propagate updates with community C2 </li>
          <li>Import routes with C1 and C2 into local VRFs</li>
          <li>Import routes with C1 only into reflect VRFs, add community C3</li>
          <li>Import routes with C3 from reflect VRFs into core VRF</li>
        </ul>

        <t>During import into local VRFs, prepend the AS_PATH:</t>
        <ul spacing="normal">
          <li>2 times for routes with C1 only</li>
          <li>1 time for routes with C2</li>
          <li>do not prepend for routes with C3</li>
        </ul>
        <t>As a result, paths with C1, C2 and C3 will all have the same AS_PATH length in local VRFs and will be eligible for ECMP.</t>
        </section>

        <section>
        <name>Scalability and Optimizations</name>
        <t>TODO</t>
        </section>

            <section>
            <name>Failure Handling and Convergence</name>
            <t>TODO</t>
            </section>

        <section>
        <name>Asymmetry and Traffic Engineering</name>
        <t>TODO</t>
        </section>

    </section>

    
    <section anchor="IANA">
    <!-- All drafts are required to have an IANA considerations section. See RFC 8126 for a guide.-->
      <name>IANA Considerations</name>
      <t>This memo includes no request to IANA.</t>
    </section>
    
    <section anchor="Security">
      <!-- All drafts are required to have a security considerations section. See RFC 3552 for a guide. -->
      <name>Security Considerations</name>
      <t>This document should not affect the security of the Internet.</t>
    </section>
    
    <!-- NOTE: The Acknowledgements and Contributors sections are at the end of this template -->
  </middle>

  <back>
    <references>
      <name>References</name>
      <references>
        <name>Normative References</name>
        
        <xi:include href="https://bib.ietf.org/public/rfc/bibxml/reference.RFC.8174.xml"/>
        <!-- The recommended and simplest way to include a well known reference -->
        
      </references>
 
      <references>
        <name>Informative References</name>
       
        <reference anchor="RFC2119" target="https://www.rfc-editor.org/info/rfc2119">
        <!-- Manually added reference -->
          <front>
            <title>Key words for use in RFCs to Indicate Requirement Levels</title>
            <author initials="S." surname="Bradner" fullname="S. Bradner">
              <organization/>
            </author>
            <date year="1997" month="March"/>
            <abstract>
              <t>In many standards track documents several words are used to signify the requirements in the specification. These words are often capitalized. This document defines these words as they should be interpreted in IETF documents. This document specifies an Internet Best Current Practices for the Internet Community, and requests discussion and suggestions for improvements.
              </t>
            </abstract>
          </front>
          <seriesInfo name="BCP" value="14"/>
          <seriesInfo name="RFC" value="2119"/>
          <seriesInfo name="DOI" value="10.17487/RFC2119"/>
        </reference>
       
        <reference anchor="RFC7938" target="https://www.rfc-editor.org/info/rfc7938">
        <!-- Manually added reference -->
          <front>
            <title>Use of BGP for Routing in Large-Scale Data Centers</title>
            <author initials="P." surname="Lapukhov" fullname="P. Lapukhov">
              <organization/>
            </author>
            <author initials="A." surname="Premji" fullname="A. Premji">
              <organization/>
            </author>
            <author initials="J." surname="Mitchell" fullname="J. Mitchell" role="editor">
              <organization/>
            </author>
            <date year="2016" month="August"/>
            <abstract>
              <t>Some network operators build and operate data centers that support over one hundred thousand servers. In this document, such data centers are referred to as "large-scale" to differentiate them from smaller infrastructures. Environments of this scale have a unique set of network requirements with an emphasis on operational simplicity and network stability. This document summarizes operational experience in designing and operating large-scale data centers using BGP as the only routing protocol. The intent is to report on a proven and stable routing design that could be leveraged by others in the industry.
              </t>
            </abstract>
          </front>
          <seriesInfo name="RFC" value="7938"/>
          <seriesInfo name="DOI" value="10.17487/RFC7938"/>
        </reference>
       

        <reference anchor="KIM2008" target="https://doi.org/10.1109/ISCA.2008.19">
            <front>
                <title>Technology-Driven, Highly-Scalable Dragonfly Topology</title>
                <author fullname="John Kim" initials="J." surname="Kim"/>
                <author fullname="William J. Dally" initials="W. J." surname="Dally"/>
                <author fullname="Steve Scott" initials="S." surname="Scott"/>
                <author fullname="Dennis Abts" initials="D." surname="Abts"/>
            <date year="2008"/>
          </front>
        </reference>

        <reference anchor="SPHINER2017" target="https://doi.org/10.1109/HiPINEB.2017.11">
            <front>
                <title>Dragonfly+: Low Cost Topology for Scaling Datacenters</title>
                <author fullname="Alexander Shpiner"/>
                <author fullname="Zachy Haramaty"/>
                <author fullname="Saar Eliad"/>
                <author fullname="Vladimir Zdornov"/>
                <author fullname="Barak Gafni"/>
                <author fullname="Eitan Zahavi"/>
            <date year="2017" month="February"/>
          </front>
        </reference>

        <reference anchor="FLAJSLIK2018" target="https://doi.org/10.1007/978-3-319-92040-5_15">
            <front>
                <title>Megafly: A Topology for Exascale Systems</title>
                <author fullname="Mario Flajslik"/>
                <author fullname="Eric Borch"/>
                <author fullname="Mike A. Parker"/>
            <date year="2018" month="May"/>
          </front>
        </reference>

        <reference anchor="WILKE2017" target="https://www.researchgate.net/publication/320493515_Design_Space_Exploration_of_the_Dragonfly_Topology">
            <front>
                <title>Design space exploration of the Dragonfly topology</title>
                <author fullname="Jeremiah J. Wilke"/>
                <author fullname="Sebastien Rumley"/>
                <author fullname="Min Yee Teh"/>
            <date year="2017"/>
          </front>
        </reference>

        <reference anchor="SINGH2005" target="http://cva.stanford.edu/publications/2005/thesis_arjuns.pdf">
            <front>
                <title>Load-balanced routing in interconnection networks</title>
                <author fullname="Arjun Singh">
                    <organization>Stanford University</organization>
                </author>
            <date year="2005"/>
          </front>
        </reference>       

      </references>
    </references>
    
    
 </back>
</rfc>
