<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE rfc SYSTEM "rfc2629.dtd" [
<!-- A set of on-line citation libraries are maintained on the xml2rfc web site.
     The next line defines an entity named RFC2629, which contains the necessary XML
     for the reference element, and is used much later in the file.  This XML contains an
     anchor (also RFC2629) which can be used to cross-reference this item in the text.
     You can also use local file names instead of a URI.  The environment variable
     XML_LIBRARY provides a search path of directories to look at to locate a
     relative path name for the file. There has to be one entity for each item to be
     referenced. -->
<!ENTITY RFC2119 SYSTEM "http://xml.resource.org/public/rfc/bibxml/reference.RFC.2119.xml">
<!ENTITY RFC8174 SYSTEM "http://xml.resource.org/public/rfc/bibxml/reference.RFC.8174.xml">
<!ENTITY RFC7938 SYSTEM "http://xml.resource.org/public/rfc/bibxml/reference.RFC.7938.xml">
<!ENTITY RFC7752 SYSTEM "http://xml.resource.org/public/rfc/bibxml/reference.RFC.7752.xml">
<!ENTITY RFC8277 SYSTEM "http://xml.resource.org/public/rfc/bibxml/reference.RFC.8277.xml">
<!ENTITY RFC8667 SYSTEM "http://xml.resource.org/public/rfc/bibxml/reference.RFC.8667.xml">
<!ENTITY RFC8665 SYSTEM "http://xml.resource.org/public/rfc/bibxml/reference.RFC.8665.xml">
<!ENTITY RFC8669 SYSTEM "http://xml.resource.org/public/rfc/bibxml/reference.RFC.8669.xml">
<!ENTITY RFC8663 SYSTEM "http://xml.resource.org/public/rfc/bibxml/reference.RFC.8663.xml">
<!ENTITY RFC7911 SYSTEM "http://xml.resource.org/public/rfc/bibxml/reference.RFC.7911.xml">
<!ENTITY RFC7880 SYSTEM "http://xml.resource.org/public/rfc/bibxml/reference.RFC.7880.xml">
<!ENTITY RFC4364 SYSTEM "http://xml.resource.org/public/rfc/bibxml/reference.RFC.4364.xml">
<!ENTITY RFC5920 SYSTEM "http://xml.resource.org/public/rfc/bibxml/reference.RFC.5920.xml">
<!ENTITY RFC7011 SYSTEM "http://xml.resource.org/public/rfc/bibxml/reference.RFC.7011.xml">
<!ENTITY RFC6241 SYSTEM "http://xml.resource.org/public/rfc/bibxml/reference.RFC.6241.xml">
<!ENTITY RFC6020 SYSTEM "http://xml.resource.org/public/rfc/bibxml/reference.RFC.6020.xml">
<!ENTITY RFC7854 SYSTEM "http://xml.resource.org/public/rfc/bibxml/reference.RFC.7854.xml">
<!ENTITY RFC8300 SYSTEM "http://xml.resource.org/public/rfc/bibxml/reference.RFC.8300.xml">
<!ENTITY RFC5440 SYSTEM "http://xml.resource.org/public/rfc/bibxml/reference.RFC.5440.xml">
<!ENTITY RFC7348 SYSTEM "http://xml.resource.org/public/rfc/bibxml/reference.RFC.7348.xml">
<!ENTITY RFC7637 SYSTEM "http://xml.resource.org/public/rfc/bibxml/reference.RFC.7637.xml">
<!ENTITY RFC3031 SYSTEM "http://xml.resource.org/public/rfc/bibxml/reference.RFC.3031.xml">
<!ENTITY RFC8014 SYSTEM "http://xml.resource.org/public/rfc/bibxml/reference.RFC.8014.xml">
<!ENTITY RFC8402 SYSTEM "http://xml.resource.org/public/rfc/bibxml/reference.RFC.8402.xml">
<!ENTITY RFC5883 SYSTEM "http://xml.resource.org/public/rfc/bibxml/reference.RFC.5883.xml">
<!ENTITY RFC8231 SYSTEM "http://xml.resource.org/public/rfc/bibxml/reference.RFC.8231.xml">
<!ENTITY RFC8281 SYSTEM "http://xml.resource.org/public/rfc/bibxml/reference.RFC.8281.xml">
<!ENTITY RFC5925 SYSTEM "http://xml.resource.org/public/rfc/bibxml/reference.RFC.5925.xml">
<!ENTITY RFC8253 SYSTEM "http://xml.resource.org/public/rfc/bibxml/reference.RFC.8253.xml">
<!ENTITY RFC6790 SYSTEM "http://xml.resource.org/public/rfc/bibxml/reference.RFC.6790.xml">
<!ENTITY RFC8662 SYSTEM "http://xml.resource.org/public/rfc/bibxml/reference.RFC.8662.xml">
<!ENTITY RFC8491 SYSTEM "http://xml.resource.org/public/rfc/bibxml/reference.RFC.8491.xml">
<!ENTITY RFC8476 SYSTEM "http://xml.resource.org/public/rfc/bibxml/reference.RFC.8476.xml">
<!-- There is also a library of current Internet Draft citations.  It isn't a good idea to
     actually use one for the template because it might have disappeared when you come to test
     this template.  This is the form of the entity definition
     &lt;!ENTITY I-D.mrose-writing-rfcs SYSTEM
     "http://xml.resource.org/public/rfc/bibxml3/reference.I-D.mrose-writing-rfcs.xml">
     corresponding to a draft filename draft-mrose-writing-rfcs-nn.txt. The citation will be
     to the most recent draft in the sequence, and is updated roughly hourly on the web site.
     For working group drafts, the same principle applies: file name starts draft-ietf-wgname-..
     and entity file is reference.I-D.ietf-wgname-...  The corresponding entity name is
     I-D.ietf-wgname-... (I-D.mrose-writing-rfcs for the other example).  Of course this doesn't
     change when the draft version changes.
     -->
<!-- Fudge for XMLmind which doesn't have this built in -->
<!ENTITY nbsp    "&#160;">
]>

<!-- Extra statement used by XSLT processors to control the output style. -->
<?xml-stylesheet type='text/xsl' href='rfc2629.xslt' ?>


<!-- Processing Instructions can be placed here but if you are editing
     with XMLmind (and maybe other XML editors) they are better placed
     after the rfc element start tag as shown below. -->

<!-- Information about the document.
     category values: std, bcp, info, exp, and historic
     For Internet-Drafts, specify attribute "ipr".
     (ipr values are: full3667, noModification3667, noDerivatives3667),
     Also for Internet-Drafts, can specify values for
     attributes "docName" and, if relevant, "iprExtract".  Note
     that the value for iprExtract is the anchor attribute
     value of a section (such as a MIB specification) that can be
     extracted for separate publication, and is only
     useful when the value of "ipr" is not "full3667". -->
    <!-- TODO: verify which attributes are specified only
               by the RFC editor.  It appears that attributes
               "number", "obsoletes", "updates", and "seriesNo"
               are specified by the RFC editor (and not by
               the document author). -->
<rfc
    category="info"
    ipr="trust200902"
    docName="draft-bookham-rtgwg-nfix-arch-04" >
    <!-- Processing Instructions- PIs (for a complete list and description,
          see file http://xml.resource.org/authoring/README.html and below... -->

    <!-- Some of the more generally applicable PIs that most I-Ds might want to use -->

    <!-- Try to enforce the ID-nits conventions and DTD validity -->
    <?rfc strict="yes" ?>

    <!-- Items used when reviewing the document -->
    <?rfc comments="no" ?>  <!-- Controls display of <cref> elements -->
    <?rfc inline="no" ?>    <!-- When no, put comments at end in comments section,
                                 otherwise, put inline -->
    <?rfc editing="no" ?>   <!-- When yes, insert editing marks: editing marks consist of a
                                 string such as <29> printed in the blank line at the
                                 beginning of each paragraph of text. -->

    <!-- Create Table of Contents (ToC) and set some options for it.
         Note the ToC may be omitted for very short documents, but idnits insists on a ToC
         if the document has more than 15 pages. -->
   <?rfc toc="yes"?>
   <?rfc tocompact="yes"?> <!-- If "yes" eliminates blank lines before main section entries. -->
   <?rfc tocdepth="3"?>    <!-- Sets the number of levels of sections/subsections... in ToC -->

    <!-- Choose the options for the references.
         Some like symbolic tags in the references (and citations) and others prefer
         numbers. The RFC Editor always uses symbolic tags.
         The tags used are the anchor attributes of the references. -->
    <?rfc symrefs="yes"?>
    <?rfc sortrefs="no" ?> <!-- If "yes", causes the references to be sorted in order of tags.
                                 This doesn't have any effect unless symrefs is "yes" also. -->

    <!-- These two save paper: Just setting compact to "yes" makes savings by not starting each
         main section on a new page but does not omit the blank lines between list items.
         If subcompact is also "yes" the blank lines between list items are also omitted. -->
    <?rfc compact="yes" ?>
    <?rfc subcompact="no" ?>
    <!-- end of list of popular I-D processing instructions -->


    <!-- ***** FRONT MATTER ***** -->
<front>
    <!-- The abbreviated title is used in the page header - it is only necessary if the
         full title is longer than 42 characters -->
    <title abbrev="Network Function Interconnect">An Architecture for Network Function Interconnect</title>

    <!-- add 'role="editor"' below for the editors if appropriate -->
    <author
      role="editor"
      fullname="Colin Bookham"
      initials="C."
      surname="Bookham">
      <!-- abbrev not needed but can be used for the header
           if the full organization name is too long -->
      <organization>Nokia</organization>
      <address>
        <postal>
          <street>740 Waterside Drive</street>
          <city>Almondsbury, Bristol</city>
          <country>UK</country>
        </postal>
        <email>colin.bookham@nokia.com</email>
      </address>
    </author>

    <author
      fullname="Andrew Stone"
      initials="A."
      surname="Stone">
      <!-- abbrev not needed but can be used for the header
           if the full organization name is too long -->
      <organization abbrev="Nokia">Nokia</organization>
      <address>
        <postal>
          <street>600 March Road</street>
          <city>Kanata, Ontario</city>
          <country>Canada</country>
        </postal>
        <email>andrew.stone@nokia.com</email>
      </address>
    </author>

    <author
      fullname="Jeff Tantsura"
      initials="J."
      surname="Tantsura">
      <!-- abbrev not needed but can be used for the header
           if the full organization name is too long -->
      <organization abbrev="Microsoft">Microsoft</organization>
      <address>
        <postal>
          <street></street>
          <city></city>
          <country></country>
        </postal>
        <email>jefftant.ietf@gmail.com</email>
      </address>
    </author>

    <author
      fullname="Muhammad Durrani"
      initials="M."
      surname="Durrani">
      <!-- abbrev not needed but can be used for the header
           if the full organization name is too long -->
      <organization abbrev="Equinix Inc">Equinix Inc</organization>
      <address>
        <postal>
          <street>1188 Arques Ave</street>
          <city>Sunnyvale CA</city>
          <country>USA</country>
        </postal>
        <email>mdurrani@equinix.com</email>
      </address>
    </author>

    <author
      fullname="Bruno Decraene"
      initials="B."
      surname="Decraene">
      <!-- abbrev not needed but can be used for the header
           if the full organization name is too long -->
      <organization abbrev="Orange">Orange</organization>
      <address>
        <postal>
          <street>38-40 Rue de General Leclerc</street>
          <city>92794 Issey Moulineaux cedex 9</city>
          <country>France</country>
        </postal>
        <email>bruno.decraene@orange.com</email>
      </address>
    </author>

    <!-- Another author who claims to be an editor -->

    <date year="2022" /> 

    <area>Routing</area>

    <!-- WG name at the upper left corner of the doc,
         IETF fine for individual submissions.  You can also
         omit this element in which case in defaults to "Network Working Group" -
         a hangover from the ancient history of the IETF! -->

    <workgroup>RTG Working Group</workgroup>

    <!-- The DTD allows multiple area and workgroup elements but only the first one has any
         effect on output.  -->
    <!-- You can add <keyword/> elements here.  They will be incorporated into HTML output
         files in a meta tag but they have no effect on text or nroff output. -->


    <abstract>
      <t>The emergence of technologies such as 5G, the Internet of Things
        (IoT), and Industry 4.0, coupled with the move towards network
        function virtualization, means that the service requirements demanded
        from networks are changing. This document describes an architecture
        for a Network Function Interconnect (NFIX) that allows for interworking
        of physical and virtual network functions in a unified and scalable
        manner across wide-area network and data center domains while
        maintaining the ability to deliver against SLAs.</t>
    </abstract>

    <note title="Requirements Language">
        <t>The key words &quot;MUST&quot;, &quot;MUST NOT&quot;,
        &quot;REQUIRED&quot;, &quot;SHALL&quot;, &quot;SHALL NOT&quot;,
        &quot;SHOULD&quot;, &quot;SHOULD NOT&quot;, &quot;RECOMMENDED&quot;,
        &quot;MAY&quot;, and &quot;OPTIONAL&quot; in this document are to be
        interpreted as described in BCP 14 <xref target="RFC2119"></xref><xref target="RFC8174"></xref>
        when, and only when, they appear in all capitals, as shown here.</t>
    </note>

</front>

<middle>
    <section title="Introduction">
      <t>With the introduction of technologies such as 5G, the Internet of
        Things (IoT), and Industry 4.0, service requirements are changing.
        In addition to the ever-increasing demand for more capacity, these
        services have other stringent service requirements that need to be
        met such as ultra-reliable and/or low-latency communication.</t>

      <t>Parallel to this, there is a continued trend to move towards network
        function virtualization. Operators are building digitalized
        infrastructure capable of hosting numerous virtualized network
        functions (VNFs). Infrastructure that can scale in and scale out
        depending on the application demand and can deliver flexibility and
        service velocity. Much of this virtualization activity is driven by
        the afore-mentioned emerging technologies as new infrastructure is
        deployed in support of them. To try and meet the new service
        requirements some of these VNFs are becoming more dispersed, so it is
        common for networks to have a mix of centralized medium- or large-sized
        data centers together with more distributed smaller ‘edge-clouds’.
        VNFs hosted within these data centers require seamless connectivity to
        each other, and to their existing physical network function (PNF)
        counterparts. This connectivity also needs to deliver against
        agreed SLAs.</t>

      <t>Coupled with the deployment of virtualization is automation. Many of
        these VNFs are deployed within SDN-enabled data centers where
        automation is simply a must-have capability to improve service
        activation lead-times. The expectation is that services will be
        instantiated in an abstract point-and-click manner and be automatically
        created by the underlying network, dynamically adapting to service
        connectivity changes as virtual entities move between hosts.</t>

      <t>This document describes an architecture for a Network Function
        Interconnect (NFIX) that allows for interworking of physical and
        virtual network functions in a unified and scalable manner. It
        describes a mechanism for establishing connectivity across multiple
        discrete domains in both the wide-area network (WAN) and the data
        center (DC) while maintaining the ability to deliver against SLAs. To
        achieve this NFIX works with the underlying topology to build a
        unified over-the-top topology.</t>

      <t>The NFIX architecture described in this document does not define
        any new protocols but rather outlines an architecture utilizing a
        collaboration of existing standards-based protocols.</t>


    </section>

    <section title='Terminology'>

      <t><list style="symbols">
      	<t>A physical network function (PNF) refers to a network device
          such as a Provider Edge (PE) router that connects physically to
          the wide-area network.</t>

        <t>A virtualized network function (VNF) refers to a network device
          such as a provider edge (PE) router that is hosted on an application
          server. The VNF may be bare-metal in that it consumes the entire
          resources of the server, or it may be one of numerous virtual
          functions instantiated as a VM or number of containers on a given
          server that is controlled by a hypervisor or container management
          platform.</t>

        <t>A Data Center Border (DCB) router refers to the network function that
          spans the border between the wide-area and the data center networks,
          typically interworking the different encapsulation techniques
          employed within each domain.</t>

        <t>An Interconnect controller is the controller responsible for
          managing the NFIX fabric and services.</t>

        <t>A DC controller is the term used for a controller that resides
          within an SDN-enabled data center and is responsible for the
          DC network(s).</t>
       </list></t>
    </section>

      <section title='Motivation'>
	      <t>Industrial automation and business-critical environments use
          applications that are demanding on the network. These applications
          present different requirements from low-latency to high-throughput,
          to application-specific traffic conditioning, or a combination.
          The evolution to 5G equally presents challenges for mobile back-,
          front- and mid-haul networks. The requirement for ultra-reliable
          low-latency communication means that operators need to re-evaluate
          their network architecture to meet these requirements.</t>

        <t>At the same time, the service edge is evolving. Where the service
          edge device was historically a PNF, the adoption of virtualization
          means VNFs are becoming more commonplace. Typically, these VNFs are
          hosted in some form of data center environment but require end-to-end
          connectivity to other VNFs and/or other PNFs. This represents a
          challenge because generally transport layer connectivity differs
          between the WAN and the data center environment. The WAN includes
          all levels of hierarchy (core, aggregation, access) that form the
          network's footprint, where transport layer connectivity using IP/MPLS
          is commonplace. In the data center native IP is commonplace,
          utilizing network virtualization overlay (NVO) technologies such
          as virtual extensible LAN (VXLAN) <xref target="RFC7348"></xref>, network virtualization
          using generic routing encapsulation (NVGRE) <xref target="RFC7637"></xref>, or generic
          network virtualization encapsulation (GENEVE)
          <xref target="I-D.ietf-nvo3-geneve"></xref>. There is a requirement to seamlessly
          integrate these islands and avoid heavy-lifting at interconnects as
          well as providing a means to provision end-to-end services with a
          single touch point at the edge.</t>

        <t>The service edge boundary is also changing. Some functions that
          were previously reasonably centralized are now becoming more
          distributed. One reason for this is to attempt to deal with low
          latency requirements. Another reason is that operators seek to
          reduce costs by deploying low/medium-capacity VNFs closer to the
          edge. Equally, virtualization also sees some of the access network
          moving towards the core. Examples of this include cloud-RAN or
          Software-Defined Access Networks.</t>

        <t>Historically service providers have architected data centers
          independently from the wide-area network, creating two independent
          domains or islands. As VNFs become part of the service landscape
          the service data-path must be extended across the WAN into the data
          center infrastructure, but in a manner that still allows operators
          to meet deterministic performance requirements. Methods for stitching
          WAN and DC infrastructures together with some form of
          service-interworking at the data center border have been
          implemented and deployed, but this service-interworking
          approach has several limitations:
          <list style="symbols">
            <t>The data center environment typically uses encapsulation
              techniques such as VXLAN or NVGRE while the WAN typically uses
              encapsulation techniques such as MPLS <xref target="RFC3031"></xref>. Underlying
              optical infrastructure might also need to be programmed.
              These are incompatible and require interworking at the service
              layer.</t>

            <t>It typically requires heavy-touch service provisioning on the
              data center border. In an end-to-end service, midpoint
              provisioning is undesirable and should be avoided.</t>

            <t>Automation is difficult; largely due to the first two points
              but with additional contributing factors. In the virtualization
              world automation is a must-have capability.</t>

            <t>When a service is operating at Layer 3 in a data center with
              redundant interconnects the risk of routing loops exists. There
              is no inherent loop avoidance mechanism when redistributing
              routes between address families so extreme care must be taken.
              Proposals such as the Domain Path (D-PATH) attribute
              <xref target="I-D.ietf-bess-evpn-ipvpn-interworking"></xref> attempt to address
              this issue but as yet are not widely implemented or deployed.</t>

            <t>Some or all the above make the service-interworking gateway
              cumbersome with questionable scaling attributes.</t>
          </list>
          </t>

          <t>Hence there is a requirement to create an open, scalable, and
            unified network architecture that brings together the wide-area
            network and data center domains. It is not an architecture
            exclusively targeted at greenfield deployments, nor does it require
            a flag day upgrade to deploy in a brownfield network. It is an
            evolutionary step to a consolidated network that uses the
            constructs of seamless MPLS <xref target="I-D.ietf-mpls-seamless-mpls"></xref> as
            a baseline and extends upon that to include topologies that may not
            be link-state based and to provide end-to-end path control. Overall
            the NFIX architecture aims to deliver the following:

          <list style="symbols">
            <t>Allows for an evolving service edge boundary without having to
              constantly restructure the architecture.</t>

            <t>Provides a mechanism for providing seamless connectivity
              between VNF to VNF, VNF to PNF, and PNF to PNF, with
              deterministic SLAs, and with the ability to provide
              differentiated SLAs to suit different service requirements.</t>

            <t>Delivers a unified transport fabric using Segment Routing (SR)
              <xref target="RFC8402"></xref> where service delivery mandates touching only the
              service edge without imposing additional encapsulation
              requirements in the DC.</t>

            <t>Embraces automation by providing an environment where any
              end-to-end connectivity can be instantiated in a single
              request manner while maintaining SLAs.</t>
          </list></t>
    </section>

       <section title='Requirements'>
	       <t>The following section outlines the requirements that the proposed
           solution must meet. From an overall perspective, the proposed
           generic architecture must:</t>

          <t><list style="symbols">
            <t>Deliver end-to-end transport LSPs using traffic-engineering (TE)
              as required to meet appropriate SLAs for the service(s)
              using those LSPs. End-to-end refers to VNF and/or PNF
              connectivity or a combination of both.</t>

            <t>Provide a solution that allows for optimal end-to-end path
              placement; where optimal not only meets the requirements of the
              path in question but also meets the global network objectives.</t>

            <t>Support varying types of VNF physical network attachment and
              logical (underlay/overlay) connectivity.</t>

            <t>Facilitate automation of service provision. As such the
              solution should avoid heavy-touch service provisioning and
              decapsulation/encapsulation at data center border routers.</t>

            <t>Provide a framework for delivering logical end-to-end networks
              using differentiated logical topologies and/or constraints.</t>

            <t>Provide a high level of stability; faults in one domain should
              not propagate to another domain.</t>

            <t>Provide a mechanism for homogeneous end-to-end OAM.</t>

            <t>Hide/localize instabilities in the different domains that
              participate in the end-to-end service.</t>

            <t>Provide a mechanism to minimize the label-stack depth
              required at path head-ends for SR-TE LSPs.</t>

            <t>Offer a high level of scalability.</t>

            <t>Although not considered in-scope of the current version of this
              document, the solution should not preclude the deployment of
              multicast. This subject may be covered in later versions of
              this document.</t>
          </list></t>
    </section>

    <section title='Theory of Operation'>
	      <t>This section describes the NFIX architecture including the
          building blocks and protocol machinery that is used to form the
          fabric. Where considered appropriate rationale is given for
          selection of an architectural component where other seemingly
          applicable choices could have been made.</t>

     <section title='VNF Assumptions'>

       <t>For the sake of simplicity, references to VNF are made in a
         broad sense. Equally, the differences between VNF and Container Network
		 Function (CNF) are largely immaterial for the purposes of this document,
		 therefore VNF is used to represent both. The way in which a VNF is instantiated
		 and provided network connectivity will differ based on environment and
		 VNF capability, but for conciseness this is not explicitly detailed
         with every reference to a VNF. Common examples of VNF variants
         include but are not limited to:</t>

      <t><list style="symbols">
        <t>A VNF that functions as a routing device and has full IP routing
          and MPLS capabilities. It can be connected simultaneously to the data
          center fabric underlay and overlay and serves as the NVO tunnel
          endpoint <xref target="RFC8014"></xref>. Examples of this might be a virtualized PE
          router, or a virtualized Broadband Network Gateway (BNG).</t>

        <t>A VNF that functions as a device (host or router) with limited IP
          routing capability. It does not connect directly to the data center
          fabric underlay but rather connects to one or more external physical
          or virtual devices that serve as the NVO tunnel endpoint(s). It may
          however have single or multiple connections to the overlay. Examples
          of this might be a mobile network control or management plane
          function.</t>

        <t>A VNF that has no routing capability. It is a virtualized function
          hosted within an application server and is managed by a hypervisor
          or container host. The hypervisor/container host acts as the NVO
          endpoint and interfaces to some form of SDN controller responsible
          for programming the forwarding plane of the virtualization host
          using, for example, OpenFlow. Examples of this might be an
          Enterprise application server or a web server running as a virtual
		  machine and front-ended by a virtual routing function such as
		  OVS/xVRS/VTF.</t>
      </list></t>

      <t>Where considered necessary exceptions to the examples provided
        above or focus on a particular scenario will be highlighted.</t>

     </section>

     <section title='Overview'>
       <t>The NFIX architecture makes no assumptions about how the network is
         physically composed, nor does it impose any dependencies upon it.
         It also makes no assumptions about IGP hierarchies and the use of
         areas/levels or discrete IGP instances within the WAN is fully
         endorsed to enhance scalability and constrain fault propagation.
		 This could apply for instance to a hierarchical WAN from core to
		 edge or from WAN to LAN connections. The overall architecture uses
		 the constructs of seamless MPLS as a baseline and extends upon that.
		 The concept of decomposing the network into multiple domains is one
		 that has been widely deployed and has been proven to scale in
		 networks with large numbers of nodes.</t>

       <t>The proposed architecture uses segment routing (SR) as its
         preferred choice of transport. Segment routing is chosen for
         construction of end-to-end LSPs given its ability to traffic-engineer
         through source-routing while concurrently scaling exceptionally well
         due to its lack of network state other than the ingress node. This
         document uses SR instantiated on an MPLS forwarding plane (SR-MPLS),
         although it does not preclude the use of SRv6 either now or at some
         point in the future. The rationale for selecting SR-MPLS is simply
         maturity and more widespread applicability across a potentially broad
         range of network devices. This document may be updated in future
         versions to include more description of SRv6 applicability.</t>
     </section>

     <section title='Use of a Centralized Controller'>

       <t>It is recognized that for most operators the move towards the use
         of a controller within the wide-area network is a significant change
         in operating model. In the NFIX architecture it is a necessary
         component. Its use is not simply to offload inter-domain path
         calculation from network elements; it provides many more benefits:</t>


       <t><list style="symbols">
        <t>It offers the ability to enforce constraints on paths that
           originate/terminate on different network elements, thereby providing
           path diversity, and/or bidirectionality/co-routing, and/or
           disjointness.</t>

        <t>It avoids collisions, re-tries, and packing problems that have been
          observed in networks using distributed TE path calculation, where
          head-ends make autonomous decisions.</t>

        <t>A controller can take a global view of path placement strategies,
          including the ability to make path placement decisions over a high
          number of LSPs concurrently as opposed to considering each LSP
          independently. In turn, this allows for ‘global’ optimization of
          network resources such as available capacity.</t>

        <t>A controller can make decisions based on near-real-time network
          state and optimize paths accordingly. For example, if a network link
          becomes congested it may recompute some of the paths transiting that
          link to other links that may not be quite as optimal but do have
          available capacity. Or if a link latency crosses a certain threshold,
          it may select to reoptimize some latency-sensitive paths away from
          that link.</t>

        <t>The logic of a controller can be extended beyond pure path
          computation and placement. If the controller is aware of services,
          service requirements, and available paths within the network it can
          cross-correlate between them and ensure that the appropriate paths
          are used for the appropriate services.</t>

        <t>The controller can provide assurance and verification of the
          underlying SLA provided to a given service.</t>
       </list></t>

       <t>As the main objective of the NFIX architecture is to unify the data
         center and wide-area network domains, using the term controller is not
         sufficiently succinct. The centralized controller may need to
         interface to other controllers that potentially reside within an
         SDN-enabled data center. Therefore, to avoid interchangeably using
         the term controller for both functions, we distinguish between them
         simply by using the terms ‘DC controller’ which as the name suggests
         is responsible for the DC, and ‘Interconnect controller’ responsible
         for managing the extended SR fabric and services.</t>

         <t>The Interconnect controller learns wide-area network topology
           information and allocation of segment routing SIDs within that
           domain using BGP link-state <xref target="RFC7752"></xref> with appropriate SR extensions.
           Equally it learns data center topology information and Prefix-SID
           allocation using BGP labeled unicast <xref target="RFC8277"></xref> with appropriate SR
           extensions, or BGP link-state if a link-state IGP is used within the
           data center. If Route-Reflection is used for exchange of BGP
           link-state or labeled unicast NLRI within one or more domains, then
           the Interconnect controller need only peer as a client with those
           Route-Reflectors in order to learn topology information.</t>

        <t>Where BGP link-state is used to learn the topology of a data center
          (or any IGP routing domain) the BGP-LS Instance Identifier
          (Instance-ID) is carried within Node/Link/Prefix NLRI and is used
          to identify a given IGP routing domain. Where labeled unicast BGP
          is used to discover the topology of one or more data center domains
          there is no equivalent way for the Interconnect controller to achieve
          a level of routing domain correlation. The controller may learn some
           splintered connectivity map consisting of 10 leaf switches, four
           spine switches, and four DCBs, but it needs some form of key to
           inform it that leaf switches 1-5, spine switches 1 and 2, and DCBs 1
           and 2 belong to data center 1, while leaf switches 6-10, spine
           switches 3 and 4, and DCBs 3 and 4 belong to data center 2. What is
          needed is a form of ‘data center membership identification’ to
          provide this correlation. Optionally this could be achieved at BGP
          level using a standard community to represent each data center, or
          it could be done at a more abstract level where for example the DC
          controller provides the membership identification to the Interconnect
          controller through an application programming interface (API).</t>

        <t>Understanding real-time network state is an important part of the
           Interconnect controller's role, and only with this information is the
          controller able to make informed decisions and take preventive or
          corrective actions as necessary. There are numerous methods
          implemented and deployed that allow for harvesting of network
          state, including (but not limited to) IPFIX <xref target="RFC7011"></xref>,
		  Netconf/YANG <xref target="RFC6241"></xref><xref target="RFC6020"></xref>,
		  streaming telemetry, BGP link-state <xref target="RFC7752"></xref>
		  <xref target="I-D.ietf-idr-te-lsp-distribution"></xref>, and the
		  BGP Monitoring Protocol (BMP) <xref target="RFC7854"></xref>.</t>

     </section>

     <section title='Routing and LSP Underlay'>

       <t>This section describes the mechanisms and protocols that are used to
         establish end-to-end LSPs; where end-to-end refers to VNF-to-VNF,
		 PNF-to-PNF, or VNF-to-PNF.</t>

         <section title='Intra-Domain Routing'>

           <t>In a seamless MPLS architecture domains are based on geographic
             dispersion (core, aggregation, access).  Within this document a
             domain is considered as any entity with a captive topology; be it
             a link-state topology or otherwise. Where reference is made to the
             wide-area network domain, it refers to one or more domains that
             constitute the wide-area network domain.</t>

          <t>This section discusses the basic building blocks required within
            the wide-area network and the data center, noting from above that
            the wide-area network may itself consist of multiple domains.</t>

            <section title='Wide-Area Network Domains'>

              <t>The wide-area network includes all levels of hierarchy
                 (core, aggregation, access) that constitute the network's MPLS
                 footprint as well as the data center border routers.
                Each domain that constitutes part of the wide-area network
                runs a link-state interior gateway protocol (IGP) such as
                ISIS or OSPF, and each domain may use IGP-inherent hierarchy
                (OSPF areas, ISIS levels) with an assumption that visibility
                is domain-wide using, for example, L2 to L1 redistribution.
                Alternatively, or additionally, there may be multiple domains
                that are split by using separate and distinct instances of IGP.
                There is no requirement for IGP redistribution of any link or
                loopback addresses between domains.</t>

              <t>Each IGP should be enabled with the relevant extensions for
                segment routing <xref target="RFC8667"></xref><xref target="RFC8665"></xref>, and each SR-capable router
                should advertise a Node-SID for its loopback address, and an
                Adjacency-SID (Adj-SID) for every connected interface
                (unidirectional adjacency) belonging to the SR domain. SR
                Global Blocks (SRGB) can be allocated to each domain as deemed
                appropriate to specific network requirements. Border routers
                belonging to multiple domains have an SRGB for each domain.</t>

              <t>The default forwarding path for intra-domain LSPs
                that do not require TE is simply an SR LSP containing a single
                label advertised by the destination as a Node-SID and
                representing the ECMP-aware shortest path to that destination.
                Intra-domain TE LSPs are constructed as required by
                the Interconnect controller. Once a path is calculated it is
                advertised as an explicit SR Policy
                <xref target="I-D.ietf-spring-segment-routing-policy"></xref> containing one
                or more paths expressed as one or more segment-lists, which may
				optionally contain binding SIDs if requirements dictate. An
                SR Policy is identified through the tuple
                [headend, color, endpoint] and this tuple is used extensively
                by the Interconnect controller to associate services with an
                underlying SR Policy that meets its objectives.</t>
				
              <t>To provide support for ECMP the Entropy Label <xref target="RFC6790"></xref><xref target="RFC8662"></xref>
                should be utilized. Entropy Label Capability (ELC) should be 
                advertised into the IGP using the IS-IS Prefix Attributes TLV
                <xref target="I-D.ietf-isis-mpls-elc"></xref> or the OSPF Extended
				Prefix TLV <xref target="I-D.ietf-ospf-mpls-elc"></xref> coupled
				with the Node MSD Capability sub-TLV to advertise Entropy Readable
				Label Depth (ERLD) <xref target="RFC8491"></xref><xref target="RFC8476"></xref>
				and the base MPLS Imposition (BMI). Equally, support for ELC together
				with the supported ERLD should be signaled in BGP using the BGP
				Next-Hop Capability <xref target="I-D.ietf-idr-next-hop-capability"></xref>.
				Ingress nodes and or DCBs should ensure sufficient entropy is applied
				to packets to exercise available ECMP links.</t>                				
			  
            </section>

            <section title='Data Center Domain'>

              <t>The data center domain includes all fabric switches, network
                virtualization edge (NVE), and the data center border routers.
                The data center routing design may align with the framework of
                <xref target="RFC7938"></xref> running eBGP single-hop sessions established over
                direct point-to-point links, or it may use an IGP for
                dissemination of topology information. This document focuses on the
				former, simply because the use of an IGP largely makes the data center's
				behaviour analogous to that of a wide-area network domain.</t>

              <t>The chosen method of transport or encapsulation within the
                data center for NFIX is SR-MPLS over IP/UDP <xref target="RFC8663"></xref> or,
                where possible, native SR-MPLS. The choice of SR-MPLS over
                IP/UDP or native SR-MPLS allows for good entropy to maximize
                the use of equal-cost Clos fabric links. Native SR-MPLS
				encapsulation provides entropy through use of the Entropy Label,
				and, like the wide-area network, support for ELC together with the
				supported ERLD should be signaled using the BGP Next-Hop Capability
				attribute. As described in <xref target="RFC6790"></xref> the ELC
				is an indication from the egress node of an MPLS tunnel to the
				ingress node of the MPLS tunnel that it is capable of processing
				an Entropy Label. The BGP Next-Hop Capability is a non-transitive
				attribute which is modified or deleted when the next-hop is 
				changed to reflect the capabilities of the new next-hop. If we
				assume that the path of a BGP-signaled LSP transits through 
				multiple ASNs, and/or a single ASN with multiple next-hops, then
				it is not possible for the ingress node to determine the ELC
				of the egress node. Without this end-to-end signaling capability
				the entropy label must only be used when it is explicitly known,
				through configuration or other means, that the egress node has
				support for it. Entropy for SR-MPLS over IP/UDP encapsulation
				uses the source UDP port for IPv4 and the Flow Label for IPv6.
				Again, the ingress network function should ensure sufficient
				entropy is applied to exercise available ECMP links.</t>
				
              <t>Another significant advantage of the use of native SR-MPLS or
			    SR-MPLS over IP/UDP is that it allows for a lightweight interworking
			    function at the DCB without the requirement for midpoint provisioning;
			    interworking between the data center and the wide-area network
			    domains becomes an MPLS label swap/continue action.</t>				

              <t>Loopback addresses of network elements within the data center are
                advertised using labeled unicast BGP with the addition of SR
                Prefix SID extensions <xref target="RFC8669"></xref> containing a
				globally unique and persistent Prefix-SID. The data-plane encapsulation
				of SR-MPLS over IP/UDP or native SR-MPLS allows network elements
                within the data center to consume BGP Prefix-SIDs and
                legitimately use those in the encapsulation.</t>

            </section>

         </section>

         <section title='Inter-Domain Routing'>

           <t>Inter-domain routing is responsible for establishing connectivity
             between any domains that form the wide-area network, and between
             the wide-area network and data center domains. It is considered
             unlikely that every end-to-end LSP will require a TE path, hence
             there is a requirement for a default end-to-end forwarding path.
             This default forwarding path may also become the path of last
             resort in the event of a non-recoverable failure of a TE path.
             Similar to the seamless MPLS architecture this inter-domain MPLS
             connectivity is realized using labeled unicast BGP <xref target="RFC8277"></xref> with
             the addition of SR Prefix SID extensions.</t>

          <t>Within each wide-area network domain all service edge routers,
            DCBs, and ABRs/ASBRs form part of the labeled BGP mesh, which can
            be either full-mesh, or more likely based on the use of
            route-reflection. Each of these routers advertises its respective
            loopback addresses into labeled BGP together with an MPLS label
            and a globally unique Prefix-SID. Routes are advertised between
            wide-area network domains by ABRs/ASBRs that impose next-hop-self
            on advertised routes. The function of imposing next-hop-self for
            labeled routes means that the ABR/ASBR allocates a new label for
            advertised routes and programs a label-swap entry in the
            forwarding plane for received and advertised routes. In short it
            becomes part of the forwarding path.</t>

          <t>DCB routers have labeled BGP sessions towards the wide-area
            network and labeled BGP sessions towards the data center.
            Routes are bidirectionally advertised between the domains
            subject to policy, with the DCB imposing itself as next-hop
            on advertised routes. As above, the function of imposing
            next-hop-self for labeled routes implies allocation of a new
            label for advertised routes and a label-swap entry being programmed
            in the forwarding plane for received and advertised labels.
            The DCB thereafter becomes the anchor point between the wide-area
            network domain and the data center domain.</t>

          <t>Within the wide-area network next-hops for labeled unicast
            routes containing Prefix-SIDs are resolved to SR LSPs, and within
            the data center domain next-hops for labeled unicast routes
            containing Prefix-SIDs are resolved to SR LSPs or IP/UDP tunnels.
            This provides end-to-end connectivity without a traffic-engineering
            capability.</t>

          <figure  anchor="figure1" align="center">
                 <artwork>
                   <![CDATA[

      +---------------+   +----------------+   +---------------+
      |  Data Center  |   |   Wide-Area    |   |   Wide-Area   |
      |              +-----+   Domain 1   +-----+  Domain ‘n’  |
      |              | DCB |              | ABR |              |
      |              +-----+              +-----+              |
      |               |   |                |   |               |
      +---------------+   +----------------+   +---------------+
      <-- SR/SRoUDP -->   <---- IGP/SR ---->   <--- IGP/SR ---->
      <--- BGP-LU ---> NHS <--- BGP-LU ---> NHS <--- BGP-LU --->
                   ]]>
                 </artwork>
                 <postamble>Default Inter-Domain Forwarding Path</postamble>
             </figure>

         </section>

         <section title='Intra-Domain and Inter-Domain Traffic-Engineering'>

           <t>The capability to traffic-engineer intra- and inter-domain
             end-to-end paths is considered a key requirement in order to meet
             the service objectives previously outlined. To achieve optimal
             end-to-end path placement the key components to be considered are
             path calculation, path activation, and FEC-to-path binding
             procedures.</t>

          <t>In the NFIX architecture end-to-end path calculation is performed
            by the Interconnect controller. The mechanics of how the objectives
            of each path are calculated are beyond the scope of this document.
            Once a path is calculated based upon its objectives and
            constraints, the path is advertised from the controller to the
            LSP headend as an explicit SR Policy containing one or more paths
            expressed as one or more segment-lists. An SR Policy is identified
            through the tuple [headend, color, endpoint] and this tuple is used
            extensively by the Interconnect controller to associate services
            with an underlying SR Policy that meets its objectives.</t>

          <t>The segment-list of an SR Policy encodes a source-routed path
            towards the endpoint. When calculating the segment-list the
            Interconnect controller makes comprehensive use of the
            Binding-SID (BSID), instantiating BSID anchors as necessary at path
            midpoints when calculating and activating a path. The use of BSID
            is considered fundamental to segment routing as described in 
			<xref target="I-D.filsfils-spring-sr-policy-considerations"></xref>.
			It provides opacity between domains, ensuring that any segment
			churn is constrained to a single domain. It also reduces the number
			of segments/labels that the headend needs to impose, which is
            particularly important given that network elements within a data
            center generally have limited label imposition capabilities.
            In the context of the NFIX architecture it is also the vehicle
            that allows for removal of heavy midpoint provisioning at the DCB.</t>

          <t>For example, assume that VNF1 is situated in data center 1, which
            is interconnected to the wide-area network via DCB1. VNF1 requires
            connectivity to VNF2, situated in data center 2, which is
            interconnected to the wide-area network via DCB2. Assuming there
            is no existing TE path that meets VNF1’s requirements, the
            Interconnect controller will:</t>

          <t><list style="symbols">
            <t>Instantiate an SR Policy on DCB1 with BSID n and a
              segment-list containing the relevant segments of a TE path
              to DCB2. DCB1 therefore becomes a BSID anchor.</t>

            <t>Instantiate an SR Policy on VNF1 with BSID m and a segment-list
              containing segments {DCB1, n, VNF2}.</t>
          </list></t>

          <figure  anchor="figure2" align="center">
                 <artwork>
                   <![CDATA[

       +---------------+  +----------------+  +---------------+
       | Data Center 1 |  |   Wide-Area    |  | Data Center 2 |
       | +----+       +----+      3       +----+       +----+ |
       | |VNF1|       |DCB1|-1   / \   5--|DCB2|       |VNF2| |
       | +----+       +----+  \ /   \ /   +----+       +----+ |
       |               |  |    2     4     |  |               |
       +---------------+  +----------------+  +---------------+
       SR Policy      SR Policy
       BSID m         BSID n
      {DCB1,n,VNF2} {1,2,3,4,5,DCB2}
                   ]]>
                 </artwork>
                 <postamble>Traffic-Engineered Path using BSID</postamble>
             </figure>

           <t>In the above figure a single DCB is used to interconnect two
		     domains. Similarly, in the case of two wide-area domains the DCB
			 would be represented as an ABR or ASBR. In some single operator
			 environments domains may be interconnected using adjacent ASBRs
			 connected via a distinct physical link. In this scenario the
			 procedures outlined above may be extended to incorporate the 
			 mechanisms used in Egress Peer Engineering (EPE) <xref target="I-D.ietf-spring-segment-routing-central-epe"></xref>
             to form a traffic-engineered path spanning distinct domains.</t>


            <section title='Traffic-Engineering and ECMP'>


           <t>Where the Interconnect controller is used to place SR policies,
		     providing support for ECMP requires some consideration. An SR
			 Policy is described with one or more segment-lists, and each of
			 those segment-lists may or may not provide ECMP as a sum instruction
			 and each SID itself may or may not support ECMP forwarding. When
			 an individual SID is a BSID, an ECMP path may or may not also be
			 nested within. The Interconnect controller may choose to place a
			 path consisting entirely of non-ECMP-aware Adj-SIDs (each SID
			 representing a single adjacency) such that the controller has explicit
			 hop-by-hop knowledge of where that SR-TE LSP is routed. This is
			 beneficial to allow the controller to take corrective action if the
			 criteria that was used to initially select a particular link in a
			 particular path subsequently changes. For example, if the latency
			 of a link increases or a link becomes congested and a path should
			 be rerouted. If ECMP-aware SIDs are used in the SR policy segment-list
			 (including Node-SIDs, Adj-SIDs representing parallel links, and Anycast
			 SIDs) SR routers are able to make autonomous decisions about where
			 traffic is forwarded. As a result, it is not possible for the controller
			 to fully understand the impact of a change in network state and react
			 to it. With this in mind there are a number of approaches that could
			 be adopted:</t>

           <t><list style="symbols">
            <t>If there is no requirement for the Interconnect controller to
			explicitly track paths on a hop-by-hop basis, ECMP-aware SIDs may be
			used in the SR policy segment-list. This approach may require multiple
			[ELI, EL] pairs to be inserted at the ingress node; for example,
			above and below a BSID to provide entropy in multiple domains.</t>

			<t>If there is a requirement for the Interconnect controller to
			explicitly track paths on a hop-by-hop basis to provide the capability
			to reroute them based on changes in network state, SR policy
			segment-lists should be constructed of non-ECMP-aware Adj-SIDs.</t>

			<t>A hybrid approach that allows for a level of ECMP (at the
			headend) together with the ability for the Interconnect controller
			to  explicitly track paths is to instantiate an SR policy consisting
			of a set of segment-lists, each containing non-ECMP-aware Adj-SIDs.
			Each segment-list will be assigned a weight to allow for ECMP or
			UCMP. This approach does however imply computation and programming
			of two paths instead of one.</t>

			<t>Another hybrid approach might work as follows. Redundant DCBs
			advertise an Anycast-SID ‘A’ into the data center, and also
			instantiate an SR policy with a segment-list consisting of
			non-ECMP-aware Adj-SIDs meeting the required connectivity and
			SLA. The BSID value of this SR policy ‘B’ must be common to both
			redundant DCBs, but the calculated paths are diverse. Indeed,
			multiple segment-lists could be used in this SR policy. A VNF
			could then instantiate an SR policy with a segment-list of
			{A, B} to achieve ECMP in the data center and TE in the wide-area
			network with the option of ECMP at the BSID anchor.</t>
           </list></t>
            </section>
         </section>
     </section>

     <section title='Service Layer'>

       <t>The service layer is intended to deliver Layer 2 and/or Layer 3 VPN
         connectivity between network functions to create an overlay utilizing
         the routing and LSP underlay described in section 5.4. To do this the
		 solution employs the EVPN and/or VPN-IPv4/IPv6 address families to
		 exchange Layer 2 and Layer 3 Network Layer Reachability Information
		 (NLRI). When these NLRI are exchanged between domains it is typical
		 for the border router to set next-hop-self on advertised routes. With
		 the proposed routing and LSP underlay however, this is not required
		 and EVPN/VPN-IPv4/IPv6 routes should be passed end-to-end without
         transit routers modifying the next-hop attribute.</t>

      <t>Section 5.4.2 describes the use of labeled unicast BGP to exchange
        inter-domain routes to establish a default forwarding path.
        Labeled-unicast BGP is used to exchange prefix reachability between
        service edge routers, with domain border routers imposing next-hop-self
        on routes advertised between domains. This provides a default
        inter-domain forwarding path and provides the required connectivity
        to establish inter-domain BGP sessions between service edges for the
        exchange of EVPN and/or VPN-IPv4/IPv6 NLRI. If route-reflection is
        used for the EVPN and/or VPN-IPv4/IPv6 address families within one
        or more domains, it may be desirable to create inter-domain BGP
        sessions between route-reflectors. In this case the peering addresses
        of the route-reflectors should also be exchanged between domains using
        labeled unicast BGP. This creates a connectivity model analogous to
        BGP/MPLS IP-VPN Inter-AS option C <xref target="RFC4364"></xref>.</t>

        <figure anchor="figure3" align='center'>
               <artwork>

                 <![CDATA[

         +----------------+  +----------------+  +----------------+
         |     +----+     |  |     +----+     |  |     +----+     |
       +----+  | RR |    +----+    | RR |    +----+    | RR |   +----+
       | NF |  +----+    | DCI|    +----+    | DCI|    +----+   | NF |
       +----+            +----+              +----+             +----+
         |     Domain     |  |     Domain     |  |     Domain     |
         +----------------+  +----------------+  +----------------+
         <-------> <-----> NHS <-- BGP-LU ---> NHS <-----> <------>
         <-------> <--------- EVPN/VPN-IPv4/v6 ----------> <------>
                 ]]>
               </artwork>
               <postamble>Inter-Domain Service Layer</postamble>
           </figure>

           <t>EVPN and/or VPN-IPv4/v6 routes received from a peer in a
             different domain will contain a next-hop equivalent to the router
             that sourced the route. The next-hop of these routes can be
             resolved to a labeled-unicast route (default forwarding path) or
             to an SR policy (traffic-engineered forwarding path) as appropriate
             to the service requirements. The exchange of EVPN and/or
             VPN-IPv4/IPv6 routes in this manner implies that
             Route-Distinguisher and Route-Target values remain intact
             end-to-end.</t>

          <t>The use of end-to-end EVPN and/or VPN-IPv4/IPv6 address families
            without the imposition of next-hop-self at border routers
            complements the gateway-less transport layer architecture.
            It negates the requirement for midpoint service provisioning
            and as such provides the following benefits:</t>

          <t><list style="symbols">
            <t>Avoids the translation of MAC/IP EVPN routes to IP-VPN
              routes (and vice versa) that is typically associated with
              service interworking.</t>

            <t>Avoids instantiation of MAC-VRFs and IP-VPNs for each tenant
              resident in the DCB.</t>

            <t>Avoids provisioning of demarcation functions between the data
              center and wide-area network such as QoS, access-control,
              aggregation and isolation.</t>
            </list></t>
     </section>

     <section title='Service Differentiation'>

       <t>As discussed in section 5.4.3, the use of TE paths is a key
         capability of the NFIX solution framework described in this document.
         The Interconnect controller computes end-to-end TE paths between
         NFs and programs DC nodes, DCBs, ABR/ASBRs, via SR Policy, with the
         necessary label forwarding entries for each [headend, color, endpoint].
         The collection of [headend, endpoint] pairs for the same color
         constitutes a logical network topology, where each topology satisfies
         a given SLA requirement.</t>

        <t>The Interconnect controller discovers the endpoints associated
          to a given topology (color) upon the reception of EVPN or IPVPN
          routes advertised by the endpoint. The EVPN and IPVPN NLRIs are
          advertised by the endpoint nodes along with a color extended
          community which identifies the topology to which the owner of the
          NLRI belongs. At a coarse level all the EVPN/IPVPN routes of the
          same VPN can be advertised with the same color, and therefore a
          TE topology would be established on a per-VPN basis. At a finer
          level, IPVPN and especially EVPN provide a more granular way
          of coloring routes, which allows the Interconnect controller
          to associate multiple topologies to the same VPN. For example:</t>

        <t><list style="symbols">
          <t>All the EVPN MAC/IP routes for a given VNF may be advertised with
            the same color. This would allow the Interconnect controller to
            associate topologies per VNF within the same VPN; that is, VNF1
            could be blue (e.g., low-latency topology) and VNF2 could be
            green (e.g., high-throughput).</t>

          <t>The EVPN MAC/IP routes and Inclusive Multicast Ethernet Tag (IMET)
            route for VNF1 may be advertised with different colors, e.g.,
            red and brown, respectively. This would allow the association
            of e.g., a low-latency topology for unicast traffic to VNF1 and
            best-effort topology for BUM traffic to VNF1.</t>

          <t>Each EVPN MAC/IP route or IP-Prefix route from a given VNF may
            be advertised with different color. This would allow the
            association of topologies at the host level or host route
            granularity.</t>
        </list></t>
     </section>

     <section title='Automated Service Activation'>
       <t>The automation of network and service connectivity for instantiation
         and mobility of virtual machines is a highly desirable attribute
         within data centers. Since this concerns service connectivity, it
         should be clear that this automation is relevant to virtual functions
         that belong to a service as opposed to a virtual network function that
         delivers services, such as a virtual PE router.</t>

      <t>Within an SDN-enabled data center, a typical hierarchy from top to
        bottom would include a policy engine (or policy repository), one or
        more DC controllers, numerous hypervisors/container hosts that function
        as NVO endpoints, and finally the virtual machines (VMs)/containers,
        which we’ll refer to generically as virtualization hosts.</t>

      <t>The mechanisms used to communicate between the policy engine and DC
        controller, and between the DC controller and hypervisor/container are
        not relevant here and as such they are not discussed further. What is
        important is the interface and information exchange between the
        Interconnect controller and the data center SDN functions:</t>

      <t><list style="symbols">
        <t>The Interconnect controller interfaces with the data center policy
          engine and publishes the available colors, where each color
          represents a topological service connectivity map that meets a set
          of constraints and SLA objectives. This interface is a
          straightforward API.</t>

        <t>The Interconnect controller interfaces with the DC controller to
          learn overlay routes. This interface is BGP and uses the EVPN
          Address Family.</t>
      </list></t>

      <t>With the above framework in place, automation of network and
        service connectivity can be implemented as follows:</t>

      <t><list style="symbols">
        <t>The virtualization host is turned-up. The NVO endpoint notifies
          the DC controller of the startup.</t>

        <t>The DC controller retrieves service information, IP addressing
          information, and service ‘color’ for the virtualization host from
          the policy engine. The DC controller subsequently programs the
          associated forwarding information on the virtualization host.
          Since the DC controller is now aware of MAC and IP address
          information for the virtualization host, it advertises that
          information as an EVPN MAC Advertisement Route into the overlay.</t>

        <t>The Interconnect controller receives the EVPN MAC Advertisement
          Route (potentially via a Route-Reflector) and correlates it with
          locally held service information and SLA requirements using Route
          Target and Color communities. If the relevant SR policies are not
          already in place to support the service requirements and logical
          connectivity, including any binding-SIDs, they are calculated and
          advertised to the relevant headends.</t>
      </list></t>

      <t>The same automated service activation principles can also be used to
        support the scenario where virtualization hosts are moved between
        hypervisors/container hosts for resourcing or other reasons. We
        refer to this simply as mobility. If a virtualization host is turned
        down the parent NVO endpoint notifies the DC controller, which in
        turn notifies the policy engine and withdraws any EVPN MAC
        Advertisement Routes. Thereafter all associated state is removed. When
        the virtualization host is turned up on a different
        hypervisor/container host, the automated service connectivity
        process outlined above is simply repeated.</t>
     </section>

     <section title='Service Function Chaining'>
       <t>Service Function Chaining (SFC) defines an ordered set of abstract
         service functions and the subsequent steering of traffic through them.
         Packets are classified at ingress for processing by the required set
         of service functions (SFs) in an SFC-capable domain and are then
         forwarded through each SF in turn for processing. The ability to
         dynamically construct SFCs containing the relevant SFs in the right
         sequence is a key requirement for operators.</t>

        <t>To enable flexible service function deployment models that support
          agile service insertion the NFIX architecture adopts the use of BGP
          as the control plane to distribute SFC information. The BGP control
          plane for Network Service Header (NSH) SFC
          <xref target="I-D.ietf-bess-nsh-bgp-control-plane"></xref> is used for this purpose
          and defines two route types: the Service Function Instance Route
          (SFIR) and the Service Function Path Route (SFPR).</t>

        <t>The SFIR is used to advertise the presence of a service function
          instance (SFI) as a function type (e.g., firewall, TCP optimizer)
          and is advertised by the node hosting that SFI. The SFIR is
          advertised together with a BGP Tunnel Encapsulation attribute
          containing details of how to reach that particular service function
          through the underlay network (i.e. IP address and encapsulation
          information).</t>

        <t>The SFPRs contain service function path (SFP) information and
          one SFPR is originated for each SFP. Each SFPR contains the service
          path identifier (SPI) of the path, the sequence of service function
          types that make up the path (each of which has at least one instance
          advertised in an SFIR), and the service index (SI) for each listed
          service function to identify its position in the path.</t>

        <t>Once a Classifier has determined which flows should be mapped to
          a given SFP, it imposes an NSH <xref target="RFC8300"></xref> on those packets, setting
          the SPI to that of the selected service path (advertised in an SFPR),
          and the SI to the first hop in the path. As NSH is encapsulation
          agnostic, the NSH encapsulated packet is then forwarded through the
          appropriate tunnel to reach the service function forwarder (SFF)
          supporting that service function instance (advertised in an SFIR).
          The SFF removes the tunnel encapsulation and forwards the packet
          with the NSH to the relevant SF based upon a lookup of the SPI/SI.
          When it is returned from the SF with a decremented SI value, the SFF
          forwards the packet to the next hop in the SFP using the tunnel
          information advertised by that SFI. This procedure is repeated
          until the last hop of the SFP is reached.</t>

        <t>The use of the NSH in this manner allows for service chaining
          with topological and transport independence. It also allows for the
          deployment of SFIs in a condensed or dispersed fashion depending on
          operator preference or resource availability. Service function chains
          are built in their own overlay network and share a common underlay
          network, where that common underlay network is the NFIX fabric
          described in section 5.4.  BGP updates containing an SFIR or
          SFPR are advertised in conjunction with one or more Route
          Targets (RTs), and each node in a service function overlay network
          is configured with one or more import RTs. As a result, nodes will
          only import routes that are applicable and that local policy dictates.
          This provides the ability to support multiple service function
          overlay networks or the construction of service function chains
          within L3VPN or EVPN services.</t>

        <t>Although SFCs are constructed in a unidirectional manner, the BGP
          control plane for NSH SFC allows for the optional association of
          multiple paths (SFPRs). This provides the ability to construct a
          bidirectional service function chain in the presence of multiple
          equal-cost paths between source and destination to avoid problems
          that SFs may suffer with traffic asymmetry.</t>

        <t>The proposed SFC model can be considered decoupled in that the use
          of SR as a transport between SFFs is completely independent of the
          use of NSH to define the SFC. That is, it uses an NSH-based SFC and
          SR is just one of many encapsulations that could be used between
          SFFs. A similar more integrated approach proposes encoding a
          service function as a segment so that an SFC can be constructed as
          a segment-list. In this case it can be considered an SR-based SFC
          with an NSH-based service plane since the SF is unaware of the
          presence of the SR. Functionally both approaches are very similar
          and as such both could be adopted and could work in parallel.
          Construction of SFCs based purely on SR (SF is SR-aware) are not
          considered at this time.</t>

     </section>

     <section title='Stability and Availability'>
       <t>Any network architecture should have the capability to self-restore
         following the failure of a network element. The time to reconverge
         following the failure needs to be minimal to avoid noticeable
         disruptions in service. This section discusses protection mechanisms
         that are available for use and their applicability to the proposed
         architecture.</t>

        <section title='IGP Reconvergence'>
          <t>Within the construct of an IGP topology the Topology Independent
            Loop Free Alternate (TI-LFA) <xref target="I-D.ietf-rtgwg-segment-routing-ti-lfa"></xref>
            can be used to provide a local repair mechanism that offers both
            link and node protection.</t>

          <t>TI-LFA is a repair mechanism, and as such it is reactive and
            initially needs to detect a given failure. To provide fast failure
            detection the Bidirectional Forwarding Detection (BFD) mechanism is used.
            Consideration needs to be given to the restoration capabilities
            of the underlying transmission when deciding values for message
            intervals and multipliers to avoid race conditions, but failure
            detection in the order of 50 milliseconds can reasonably be
            anticipated. Where Link Aggregation Groups (LAG) are used,
            micro-BFD [RFC7130] can be used to similar effect. Indeed, to
            allow for potential incremental growth in capacity it is not
            uncommon for operators to provision all network links as LAG
            and use micro-BFD from the outset.</t>
        </section>

        <section title='Data Center Reconvergence'>
          <t>Clos fabrics are extremely common within data centers, and
            fundamental to a Clos fabric is the ability to load-balance using
            Equal Cost Multipath (ECMP). The number of ECMP paths will vary
            dependent on the number of devices in the parent tier but will
            never be less than two for redundancy purposes with traffic
            hashed over the available paths. In this scenario the availability
            of a backup path in the event of failure is implicit. Commonly
            within the DC, rather than computing protect paths (like LFA),
            techniques such as ‘fast rehash’ are often utilized. In this
            particular case, the failed next-hop is removed from the multi-path
            forwarding data structure and traffic is then rehashed over the
            remaining active paths.</t>

          <t>In BGP-only data centers this relies on the implementation of BGP
            multipath. As network elements in the lower tier of a Clos fabric
            will frequently belong to different ASNs, this includes the ability
            to load-balance to a prefix with different AS_PATH attribute values
            while having the same AS_PATH length; sometimes referred to as
            ‘multipath relax’ or ‘multipath multiple-AS’ <xref target="RFC7938"></xref>.</t>

          <t>Failure detection relies upon declaring a BGP session down and
            removing any prefixes learnt over that session as soon as the link
            is declared down. As links between network elements predominantly
            use direct point-to-point fiber, a link failure should be detected
            within milliseconds. BFD is also commonly used to detect IP layer
            failures.</t>
        </section>

        <section title='Exchange of Inter-Domain Routes'>
          <t>Labeled unicast BGP together with SR Prefix-SID extensions are
            used to exchange PNF and/or VNF endpoints between domains to create
            end-to-end connectivity without TE. When advertising between domains
            we assume that a given BGP prefix is advertised by at least two
            border routers (DCBs, ABRs, ASBRs) making prefixes reachable via at
            least two next-hops.</t>

          <t>BGP Prefix Independent Convergence (PIC)
            <xref target="I-D.ietf-rtgwg-bgp-pic"></xref> allows failover to a pre-computed
            and pre-installed secondary next-hop when the primary next-hop
            fails and is independent of the number of destination prefixes
            that are affected by the failure. When the primary BGP next-hop
            fails, it should be clear that BGP PIC depends on the availability
            of a secondary next-hop in the Pathlist. To ensure that multiple
            paths to the same destination are visible the BGP ADD-PATH <xref target="RFC7911"></xref>
            can be used to allow for advertisement of multiple paths for the
            same address prefix. Dual-homed EVPN/IP-VPN prefixes also have the
            alternative option of allocating different Route-Distinguishers (RDs).
            To trigger the switch from primary to secondary next-hop PIC needs
            to detect the failure and many implementations support
            ‘next-hop tracking’ for this purpose. Next-hop tracking monitors
            the routing-table and if the next-hop prefix is removed will
            immediately invalidate all BGP prefixes learnt through that
            next-hop. In the absence of next-hop tracking, multihop BFD
            <xref target="RFC5883"></xref> could optionally be used as a fast failure detection
            mechanism.</t>
        </section>

        <section title='Controller Redundancy'>
          <t>With the Interconnect controller providing an integral part of the
            network's capabilities a redundant controller design is clearly
            prudent. To this end we can consider both availability and
            redundancy. Availability refers to the survivability of a single
            controller system in a failure scenario. A common strategy for
            increasing the availability of a single controller system is to
            build the system in a high-availability cluster such that it
            becomes a confederation of redundant constituent parts as opposed
            to a single monolithic system. Should a single part fail, the system
            can still survive without the requirement to failover to a standby
            controller system. Methods for detection of a failure of one or more
            member parts of the cluster are implementation specific.</t>

          <t>To provide contingency for a complete system failure a
            geo-redundant standby controller system is required. When redundant
            controllers are deployed a coherent strategy is needed that
            provides a master/standby election mechanism, the ability to
            propagate the outcome of that election to network elements as
            required, an inter-system failure detection mechanism, and the
            ability to synchronize state across both systems such that the
            standby controller is fully aware of current state should it need
            to transition to master controller.</t>

          <t>Master/standby election, state synchronisation, and failure
            detection between geo-redundant sites can largely be considered a
            local implementation matter. The requirement to propagate the
            outcome of the master/standby election to network elements depends
            on a) the mechanism that is used to instantiate SR policies, and
            b) whether the SR policies are controller-initiated or
            headend-initiated, and these are discussed in the following
            sub-sections. In either scenario, state of SR policies should
            be advertised northbound to both master/standby controllers using
            either PCEP LSP State Report messages or SR policy extensions to
            BGP link-state <xref target="I-D.ietf-idr-te-lsp-distribution"></xref>.</t>

          <section title='SR Policy Initiator'>
            <t>Controller-initiated SR policies are suited for auto-creation
              of tunnels based on service route discovery and policy-driven
              route/flow programming and are ephemeral. Headend-initiated
              tunnels allow for permanent configuration state to be held on
              the headend and are suitable for static services that are not
              subject to dynamic changes. If all SR policies are
              controller-initiated, it negates the requirement to propagate
              the outcome of the master/standby election to network elements.
              This is because headends have no requirement for unsolicited
              requests to a controller, and therefore have no requirement to
              know which controller is master and which one is standby.
              A headend may respond to a message from a controller, but it
              is not unsolicited.</t>

            <t>If some or all SR policies are headend-initiated, then the
              requirement to propagate the outcome of the master/standby
              election exists. This is further discussed in the following
              sub-section.</t>
          </section>

          <section title='SR Policy Instantiation Mechanism'>
            <t>While candidate paths of SR policies may be provided using
              BGP, PCEP, Netconf, or local policy/configuration, this document
              primarily considers the use of PCEP or BGP.</t>

            <t>When PCEP <xref target="RFC5440"></xref><xref target="RFC8231"></xref><xref target="RFC8281"></xref> is used for instantiation
              of candidate paths of SR policies
              <xref target="I-D.barth-pce-segment-routing-policy-cp"></xref> every
              headend/PCC should establish a PCEP session with the master
              and standby controllers. To signal standby state to the PCC
              the standby controller may use a PCEP Notification message to
              set the PCEP session into overload state. While in this overload
              state the standby controller will accept path computation LSP
              state report (PCRpt) messages without delegation but will reject
              path computation requests (PCReq) and any path computation
              reports (PCRpt) with the delegation bit set. Further, the standby
               controller will not originate path computation initiate
               messages (PCInit) or path computation update request messages
              (PCUpd). In the event of the failure of the master controller,
              the standby controller will transition to active and remove the
              PCEP overload state. Following expiration of the PCEP
              redelegation timeout at the PCC any LSPs will be redelegated to
              the newly transitioned active controller. LSP state is not
              impacted unless redelegation is not possible before the state
              timeout interval expires.</t>

            <t>When BGP is used for instantiation of SR policies every headend
              should establish a BGP session with the master and standby
              controller capable of exchanging SR TE Policy SAFI. Candidate
              paths of SR policies are advertised only by the active
              controller. If the master controller should experience a failure,
              then SR policies learnt from that controller may be removed before
              they are re-advertised by the standby (or newly-active)
              controller. To minimize this possibility BGP speakers that 
			  advertise and instantiate SR policies can implement Long Lived
			  Graceful Restart (LLGR) <xref target="I-D.ietf-idr-long-lived-gr"></xref>,
			  also known as BGP persistence, to retain existing routes treated
			  as least-preferred until the new route arrives. In the absence of
			  LLGR, two other alternatives are possible:</t>

            <t><list style="symbols">
              <t>Provide a static backup SR policy.</t>
              <t>Fallback to the default forwarding path.</t>
            </list></t>
          </section>
          </section>

          <section title='Path and Segment Liveliness'>
            <t>When using traffic-engineered SR paths only the ingress router
              holds any state. The exception here is where BSIDs are used,
              which also implies some state is maintained at the BSID anchor.
              As there is no control plane set-up, it follows that there is
              no feedback loop from transit nodes of the path to notify the
              headend when a non-adjacent point of the SR path fails. The
              Interconnect controller however is aware of all paths that are
              impacted by a given network failure and should take the
              appropriate action. This action could include withdrawing an
              SR policy if a suitable candidate path is already in place, or
              simply sending a new SR policy with a different segment-list and
              a higher preference value assigned to it.</t>

            <t>Verification of data plane liveliness is the responsibility of
              the path headend. A given SR policy may be associated with
              multiple candidate paths and for the sake of clarity, we’ll
              assume two for redundancy purposes (which can be diversely
              routed). Verification of the liveliness of these paths can be
              achieved using seamless BFD (S-BFD) <xref target="RFC7880"></xref>,
			  which provides an in-band failure detection mechanism capable of
			  detecting failure in the order of tens of milliseconds. Upon
			  failure of the active path, failover to a secondary candidate
			  path can be activated at the path headend. Details of the actual
			  failover and revert mechanisms are a local implementation
			  matter.</t>

            <t>S-BFD provides a fast and scalable failure detection mechanism
              but is unlikely to be implemented in many VNFs given their
              inability to offload the process to purpose-built hardware. In
              the absence of an active failure detection mechanism such as
              S-BFD the failover from active path to secondary candidate
              path can be triggered using continuous path validity checks.
              One of the criteria that a candidate path uses to determine its
              validity is the ability to perform path resolution for the first
              SID to one or more outgoing interface(s) and next-hop(s). From
              the perspective of the VNF headend the first SID in the
              segment-list will very likely be the DCB (as BSID anchor) but
              could equally be another Prefix-SID hop within the data center.
              Should this segment experience a non-recoverable failure, the
              headend will be unable to resolve the first SID and the path
              will be considered invalid. This will trigger a failover action
              to a secondary candidate path. </t>

            <t>Injection of S-BFD packets is not just constrained to the
              source of an end-to-end LSP. When an S-BFD packet is injected
              into an SR policy path it is encapsulated with the label stack
              of the associated segment-list. It is possible therefore to
              run S-BFD from a BSID anchor for just that section of the
              end-to-end path (for example, from DCB to DCB). This allows a
              BSID anchor to detect failure of a path and take corrective
              action, while maintaining opacity between domains.</t>
          </section>
        </section>

        <section title='Scalability'>
          <t>There are many aspects to consider regarding scalability of the
            NFIX architecture. The building blocks of NFIX are standards-based
            technologies individually designed to scale for internet provider
            networks. When combined they provide a flexible and scalable
            solution:</t>

            <t><list style="symbols">
              <t>BGP has been proven to scale and operate with millions of
                routes being exchanged. Specifically, BGP labeled unicast has
                been deployed and proven to scale in existing seamless-MPLS
                networks.</t>

              <t>By placing forwarding instructions in the header of a packet,
                segment routing reduces the amount of state required in the
                network, allowing scaling to a greater number of transport
                tunnels. This aids in the feasibility of the NFIX architecture
                to permit the automated aspects of SR policy creation without
                having an impact on the state in the core of the network.</t>

              <t>The choice of utilizing native SR-MPLS or SR over IP in the
                data center continues to permit horizontal scaling without
                introducing new state inside of the data center fabric while
                still permitting seamless end to end path forwarding
                integration.</t>

              <t>BSIDs play a key role in the NFIX architecture as their use
                provides the ability to traffic-engineer across large network
                topologies consisting of many hops regardless of hardware
                capability at the headend. From a scalability perspective
                the use of BSIDs facilitates better scale due to the fact
                that detailed information about the SR paths in a domain has
                been abstracted and localized to the BSID anchor point only.
                When BSIDs are re-used amongst one or many headends they
                reduce the amount of path calculation and updates required
                at network edges while still providing seamless end to end
                path forwarding.</t>

              <t>The architecture of NFIX continues to use an independent DC
                controller. This allows continued independent scaling of data
                center management in both policy and local forwarding
                functions, while off-loading the end-to-end optimal path
                placement and automation to the Interconnect controller.
                The optimal path placement is already a scalable function
                provided in a PCE architecture. The Interconnect controller
                must compute paths, but it is not burdened by the management
                of virtual entity lifecycle and associated forwarding
                policies.</t>
            </list></t>

          <t>It must be acknowledged that with the amalgamation of the
            technology building blocks and the automation required by NFIX,
            there is an additional burden on the Interconnect controller.
            The scaling considerations are dependent on many variables, but an
            implementation of an Interconnect controller shares many overlapping
            traits and scaling concerns as PCE, where the controller and
            PCE both must:</t>

            <t><list style="symbols">
              <t>Discover and listen to topological state changes of the
                IP/MPLS topology.</t>

              <t>Compute traffic-engineered intra and inter domain paths across
                large service provider topologies.</t>

              <t>Synchronize, track and update thousands of LSPs to network
                devices upon network state changes.</t>
            </list></t>

          <t>Both entail topologies that contain tens of thousands of nodes
            and links. The Interconnect controller in an NFIX architecture
            takes on the additional role of becoming end to end service aware
            and discovering data center entities that were traditionally
            excluded from a controller's scope. Although not exhaustive, an
            NFIX Interconnect controller is impacted by some of the following:</t>

            <t><list style="symbols">
              <t>The number of individual services, the number of endpoints
                that may exist in each service, the distribution of endpoints
                in a virtualized environment, and how many data centers may
                exist. Medium or large sized data centers may be capable of
                hosting more virtual endpoints per host, but with the move to
                smaller edge-clouds the number of headends that require
                inter-connectivity increases compared to the density of
                localized routing in a centralized data center model. The
                outcome has an impact on the number of headend devices which
                may require tunnel management by the Interconnect controller.</t>

              <t>Assuming a given BSID satisfies SLA, the ability to re-use
                BSIDs across multiple services reduces the number of paths
                to track and manage. However, the number of colors or unique
                SLA definitions, and criteria such as bandwidth constraints
                impacts WAN traffic distribution requirements. As BSIDs play
                a key role for VNF connectivity, this potentially increases
                the number of BSID paths required to permit appropriate traffic
                distribution. This also impacts the number of tunnels which may
                be re-used on a given headend for different services.</t>

              <t>The frequency of virtualized hosts being created and
                destroyed and the general activity within a given service.
                The controller must analyze, track, and correlate the activity
                of relevant BGP routes to track addition and removal of
                service host or host subnets, and determine whether new SR
                policies should be instantiated, or stale unused SR policies
                should be removed from the network.</t>

              <t>The choice of SR instantiation mechanism impacts the number of
                communication sessions the controller may require. For example,
                the BGP based mechanism may only require a small number of
                sessions to route reflectors, whereas PCEP may require a
                connection to every possible leaf in the network and any
                BSID anchors.</t>

              <t>The number of hops within one or many WAN domains may affect
                the number of BSIDs required to provide transit for VNF/PNF,
                PNF/PNF, or VNF/VNF inter-connectivity.</t>

              <t>Relative to traditional WAN topologies, traditional data
                centers are generally topologically denser in node and link
                connectivity, which must be discovered by the Interconnect
                controller, resulting in a much larger and denser link-state
                database on the Interconnect controller.</t>

            </list></t>

            <section title='Asymmetric Model B for VPN Families'>
              <t>With the instantiation of multiple TE paths between any two
                VNFs in the NFIX network, the number of SR Policy
                (remote endpoint, color) routes, BSIDs and labels to support
                on VNFs becomes a choke point in the architecture. The fact
                that some VNFs are limited in terms of forwarding resources
                makes this aspect an important scale issue.</t>

              <t>As an example, if VNF1 and VNF2 in Figure 1 are associated
                to multiple topologies 1..n, the Interconnect controller will
                instantiate n TE paths in VNF1 to reach VNF2:</t>

              <t>[VNF1,color-1,VNF2] --> BSID 1</t>
              <t>[VNF1,color-2,VNF2] --> BSID 2</t>
			  <t>...</t>
			  <t>[VNF1,color-n,VNF2] --> BSID n</t>
			  
             <t>Similarly, m TE paths may be instantiated on VNF1 to reach
               VNF3, another p TE paths to reach VNF4, and so on for all the
               VNFs that VNF1 needs to communicate with in DC2. As can be
               observed, the number of forwarding resources to be instantiated
               on VNF1 may significantly grow with the number of remote
               [endpoint, color] pairs, compared with a best-effort
               architecture in which the number of forwarding resources in
               VNF1 grows with the number of endpoints only.</t>

            <t>This scale issue on the VNFs can be relieved by the use of
            an asymmetric model B service layer. The concept is illustrated
            in Figure 4.</t>

            <figure anchor="figure4" align='center'>			
             <artwork>

               <![CDATA[

                                                +------------+
          <-------------------------------------|    WAN     |
          |  SR Policy      +-------------------| Controller |
          |  BSID m         |   SR Policy       +------------+
          v  {DCI1,n,DCI2}  v   BSID n
                                {1,2,3,4,5,DCI2}
         +----------------+  +----------------+  +----------------+
         |     +----+     |  |                |  |     +----+     |
       +----+  | RR |    +----+              +----+    | RR |   +----+
       |VNF1|  +----+    |DCI1|              |DCI2|    +----+   |VNF2|
       +----+            +----+              +----+             +----+
         |       DC1      |  |       WAN      |  |       DC2      |
         +----------------+  +----------------+  +----------------+
   
         <-------- <-------------------------- NHS <------ <------
                              EVPN/VPN-IPv4/v6(colored)
    
         +----------------------------------->     +------------->
                   TE path to DCI2                ECMP path to VNF2
               (BSID to segment-list
                expansion on DCI1)
               ]]>

             </artwork>
             <postamble>Asymmetric Model B Service Layer</postamble>
           </figure>

          <t>Consider that the n different topologies needed between VNF1
            and VNF2 are really only relevant to the different TE paths that
            exist in the WAN. The WAN is the domain of the network where
            there can be significant differences in latency, throughput, or
            packet loss depending on the sequence of nodes and links the
            traffic traverses. Based on that assumption, traffic from VNF1
            can take a TE path to DCB2 (shown as DCI2 in Figure 4), while
            traffic from DCB2 to VNF2 can simply take an ECMP path. In this
            case an asymmetric model B service layer can significantly
            relieve the scale pressure on VNF1.</t>

          <t>From a service layer perspective, the NFIX architecture
            described up to now can be considered ‘symmetric’, meaning that
            the EVPN/IPVPN advertisements from e.g., VNF2 in Figure 2, are
            received on VNF1 with the next-hop of VNF2, and vice versa for
            VNF1’s routes on VNF2. SR Policies to each VNF2 [endpoint, color]
            are then required on the VNF1.</t>

          <t>In the ‘asymmetric’ service design illustrated in Figure 4, VNF2’s
            EVPN/IPVPN routes are received on VNF1 with the next-hop of DCB2
            (DCI2 in Figure 4), and VNF1’s routes are received on VNF2 with
            the next-hop of DCB1 (DCI1). Now
            SR policies instantiated on VNFs can be reduced to only the number
            of TE paths required to reach the remote DCB. For example,
            considering n topologies, in a symmetric model VNF1 has to be
            instantiated with n SR policy paths per remote VNF in DC2, whereas
            in the asymmetric model of Figure 4, VNF1 only requires n SR policy
            paths per DC, i.e., to DCB2.</t>

          <t>Asymmetric model B is a simple design choice that only requires
            the ability (on the DCB nodes) to set next-hop-self on the
            EVPN/IPVPN routes advertised to the WAN neighbors and not do
            next-hop-self for routes advertised to the DC neighbors. With
            this option, the Interconnect controller only needs to establish
            TE paths from VNFs to remote DCBs, as opposed to VNFs to remote
            VNFs.</t>

          </section>
        </section>
     </section>

    <section title='Illustration of Use'>
	      <t>For the purpose of illustration, this section provides some
          examples of how different end-to-end tunnels are instantiated
          (including the relevant protocols, SID values/label stacks etc.)
          and how services are then overlaid onto those LSPs.</t>

        <section title='Reference Topology'>
      	  <t>The following network diagram illustrates the reference network
            topology that is used for illustration purposes in this section.
            Within the data centers leaf and spine network elements may be
            present but are not shown for the purpose of clarity.</t>

            <figure anchor="figure5" align='center'>
             <artwork>
        <![CDATA[

                    +----------+
                    |Controller|
                    +----------+
                      /  |  \
             +----+          +----+          +----+     +----+
     ~ ~ ~ ~ | R1 |----------| R2 |----------| R3 |-----|AGN1| ~ ~ ~ ~
     ~       +----+          +----+          +----+     +----+       ~
     ~   DC1    |                            /  |         |    DC2   ~
   +----+       |      L=5   +----+   L=5   /   |       +----+    +----+
   | Sn |       |    +-------| R4 |--------+    |       |AGN2|    | Dn |
   +----+       |   /  M=20  +----+  M=20       |       +----+    +----+
     ~          |  /                            |         |          ~
     ~       +----+     +----+    +----+     +----+     +----+       ~
     ~ ~ ~ ~ | R5 |-----| R6 |----| R7 |-----| R8 |-----|AGN3| ~ ~ ~ ~
             +----+     +----+    +----+     +----+     +----+

        ]]>
             </artwork>
             <postamble>Reference Topology</postamble>
           </figure>
          <t>The following applies to the reference topology in Figure 5:</t>

          <t><list style="symbols">
            <t>Data center 1 and data center 2 both run BGP/SR. Both data
              centers run leaf/spine topologies, which are not shown for
              the purpose of clarity.</t>

            <t>R1 and R5 function as data center border routers for DC 1.
              AGN1 and AGN3 function as data center border routers for DC 2.</t>

            <t>Routers R1 through R8 form an independent ISIS-OSPF/SR
              instance.</t>

            <t>Routers R3, R8, AGN1, AGN2, and AGN3 form an independent
              ISIS-OSPF/SR instance.</t>

            <t>All IGP link metrics within the wide area network are metric
              10 except for links R5-R4 and R4-R3 which are both metric 20.</t>

            <t>All links have a unidirectional latency of 10 milliseconds
              except for links R5-R4 and R4-R3 which both have a unidirectional
              latency of 5 milliseconds.</t>

            <t>Source ‘Sn’ and destination ‘Dn’ represent one or more network
              functions.</t>
          </list></t>
        </section>

        <section title='PNF to PNF Connectivity'>
          <t>The first example demonstrates the simplest form of connectivity;
            PNF to PNF. The example illustrates the instantiation of a
            unidirectional TE path from R1 to AGN2 and its consumption by an
            EVPN service. The service has a requirement for high-throughput
            with no strict latency requirements. These service requirements
            are catalogued and represented using the color blue.</t>

            <t><list style="symbols">
              <t>An EVPN service is provisioned at R1 and AGN2.</t>

              <t>The Interconnect controller computes the path from R1 to
                AGN2 and calculates that the optimal path based on the service
                requirements and overall network optimization is
                R1-R5-R6-R7-R8-AGN3-AGN2. The segment-list to represent the
                calculated path could be constructed in numerous ways. It could
                be strict hops represented by a series of Adj-SIDs. It could be
                loose hops using ECMP-aware Node-SIDs, for example {R7, AGN2},
                or it could be a combination of both Node-SIDs and Adj-SIDs.
                Equally, BSIDs could be used to reduce the number of labels
                that need to be imposed at the headend. In this example, strict
                Adj-SID hops are used with a BSID at the area border router R8,
                but this should not be interpreted as the only way a path and
                segment-list can be represented.</t>

              <t>The Interconnect controller advertises a BGP SR Policy to R8
                with BSID 1000, and a segment-list containing segments
                {AGN3, AGN2}.</t>

              <t>The Interconnect controller advertises a BGP SR Policy to R1
                with BSID 1001, and a segment-list containing segments
                {R5, R6, R7, R8, 1000}. The policy is identified using
                the tuple [headend = R1, color = blue, endpoint = AGN2].</t>

              <t>AGN2 advertises an EVPN MAC Advertisement Route for MAC M1,
                which is learned by R1. The route has a next-hop of AGN2, an
                MPLS label of L1, and it carries a color extended community
                with the value blue.</t>

              <t>R1 has a valid SR policy [color = blue, endpoint = AGN2]
                with segment-list {R5, R6, R7, R8, 1000}. R1 therefore
                associates the MAC address M1 with that policy and programs
                the relevant information into the forwarding path.</t>

              <t>The Interconnect controller also learns the EVPN MAC Route
                advertised by AGN2. The purpose of this is two-fold. It allows
                the controller to correlate the service overlay with the
                underlying transport LSPs, thus creating a service
                connectivity map. It also allows the controller to dynamically
                create LSPs based upon service requirements if they do not
                already exist, or to optimize them if network conditions
                change.</t>
            </list></t>

        </section>

        <section title='VNF to PNF Connectivity'>
          <t>The next example demonstrates VNF to PNF connectivity and
            illustrates the instantiation of a unidirectional TE path from S1
            to AGN2. The path is consumed by an IP-VPN service that has a basic
            set of service requirements and as such simply uses IGP metric as
            a path computation objective. These basic service requirements
            are cataloged and represented using the color red.</t>

          <t>In this example S1 is a VNF with full IP routing and MPLS
            capability that interfaces to the data center underlay/overlay
            and serves as the NVO tunnel endpoint.</t>

          <t><list style="symbols">
            <t>An IP-VPN service is provisioned at S1 and AGN2.</t>

            <t>The Interconnect controller computes the path from S1 to
              AGN2 and calculates that the optimal path based on IGP metric
              is R1-R2-R3-AGN1-AGN2.</t>

            <t>The Interconnect controller advertises a BGP SR Policy to R1
              with BSID 1002, and a segment-list containing segments
              {R2, R3, AGN1, AGN2}.</t>

            <t>The Interconnect controller advertises a BGP SR Policy to S1
              with BSID 1003, and a segment-list containing segments
              {R1, 1002}. The policy is identified using the tuple
              [headend = S1, color = red, endpoint = AGN2].</t>

            <t>Source S1 learns a VPN-IPv4 route for prefix P1, next-hop
              AGN2. The route has a VPN label of L1, and it carries a color
              extended community with value red.</t>

            <t> S1 has a valid SR policy [color = red, endpoint = AGN2]
              with segment-list {R1, 1002} and BSID 1003. S1 therefore
              associates the VPN-IPv4 prefix P1 with that policy and programs
              the relevant information into the forwarding path.</t>

            <t>As in the previous example the Interconnect controller also
              learns the VPN-IPv4 route advertised by AGN2 in order to
              correlate the service overlay with the underlying transport
              LSPs, creating or optimizing them as required.</t>
            </list></t>

        </section>

        <section title='VNF to VNF Connectivity'>
          <t>The last example demonstrates VNF to VNF connectivity and
            illustrates the instantiation of a unidirectional TE path from S2
            to D2. The path is consumed by an EVPN service that requires low
            latency as a service requirement and as such uses latency as a path
            computation objective. This service requirement is cataloged and
            represented using the color green.</t>

          <t>In this example S2 is a VNF that has no routing capability. It is
            hosted by hypervisor H1 that in turn has an interface to a DC
            controller through which forwarding instructions are programmed.
            H1 serves as the NVO tunnel endpoint and overlay next-hop.</t>

          <t>D2 is a VNF with partial routing capability that is connected to
            a leaf switch L1. L1 connects to underlay/overlay in data center
            2 and serves as the NVO tunnel endpoint for D2. L1 advertises
            BGP Prefix-SID 9001 into the underlay.</t>

          <t><list style="symbols">
            <t>The relevant details of the EVPN service are entered in the
              data center policy engines within data center 1 and 2.</t>

            <t>Source S2 is turned-up. Hypervisor H1 notifies its parent DC
              controller, which in turn retrieves the service (EVPN)
              information, color, IP and MAC information from the policy
              engine and subsequently programs the associated forwarding
              entries onto S2. The DC controller also dynamically advertises
              an EVPN MAC Advertisement Route for S2’s IP and MAC into the
              overlay with next-hop H1. (This would trigger the return path
              set-up between L1 and H1, not covered in this example.)</t>

            <t>The DC controller in data center 1 learns an EVPN MAC
              Advertisement Route for D2, MAC M, next-hop L1. The route has an
              MPLS label of L2, and it carries a color extended community with
              the value green.</t>

            <t>The Interconnect controller computes the path between H1 and L1
              and calculates that the optimal path based on latency is
              R5-R4-R3-AGN1.</t>

            <t>The Interconnect controller advertises a BGP SR Policy to R5
              with BSID 1004, and a segment-list containing segments
              {R4, R3, AGN1}.</t>

            <t>The Interconnect controller advertises a BGP SR Policy to the
              DC controller in data center 1 with BSID 1005 and a segment-list
              containing segments {R5, 1004, 9001}. The policy is identified
              using the tuple [headend = H1, color = green, endpoint = L1].</t>

            <t>The DC controller in data center 1 has a valid SR policy
              [color = green, endpoint = L1] with segment-list {R5, 1004, 9001}
              and BSID 1005. The controller therefore associates the MAC
              Advertisement Route with that policy, and programs the associated
              forwarding rules into S2.</t>

            <t>As in the previous example the Interconnect controller also
              learns the MAC Advertisement Route advertised by D2 in order
              to correlate the service overlay with the underlying transport
              LSPs, creating or optimizing them as required.</t>

            </list></t>

        </section>


    </section>


     <section title="Conclusions">
       <t>The NFIX architecture provides an evolutionary path to a unified
         network fabric. It uses the base constructs of seamless-MPLS and
         adds end-to-end LSPs capable of delivering against SLAs, seamless
		 data center interconnect, service differentiation, service
         function chaining, and a Layer-2/Layer-3 infrastructure capable of
         interconnecting PNF-to-PNF, PNF-to-VNF, and VNF-to-VNF. </t>

      <t>NFIX establishes a dynamic, seamless, and automated connectivity
        model that overcomes the operational barriers and interworking issues
        between data centers and the wide-area network and delivers the
        following using standards-based protocols:</t>

      <t><list style='symbols'>
       <t>A unified routing control plane: Multiprotocol BGP (MP-BGP) to
         acquire inter-domain NLRI from the IP/MPLS underlay and the
		 virtualized IP-VPN/EVPN service overlay.</t>

       <t>A unified forwarding control plane: SR provides dynamic service
         tunnels with fast restoration options to meet deterministic
         bandwidth, latency and path diversity constraints. SR utilizes
         the appropriate data path encapsulation for seamless, end-to-end
         connectivity between distributed edge and core data centers across
         the wide-area network.</t>

        <t>Service Function Chaining: Leverage SFC extensions for BGP and
          segment routing to interconnect network and service functions into
          SFPs, with support for various data path implementations.</t>

        <t>Service Differentiation: Provide a framework that allows for
          construction of logical end-to-end networks with differentiated
          logical topologies and/or constraints through use of SR policies
          and coloring.</t>

        <t>Automation: Facilitates automation of service provisioning and
          avoids heavy service interworking at DCBs.</t>
     </list></t>

     <t>NFIX is deployable on existing data center and wide-area network
       infrastructures and allows the underlying data forwarding plane to
       evolve with minimal impact on the services plane.</t>
     </section>

     <section title="Security Considerations">
       <t>The NFIX architecture based on SR-MPLS is subject to the same
         security concerns as any MPLS network. No new protocols are
         introduced, hence security issues of the protocols encompassed by this
         architecture are addressed within the relevant individual standards
         documents.  It is recommended that the security framework for MPLS
         and GMPLS networks defined in <xref target="RFC5920"></xref> is adhered to. Although
         <xref target="RFC5920"></xref> focuses on the use of the RSVP-TE and LDP control
         planes, the practices and procedures are extendable to an SR-MPLS
         domain.</t>

      <t>The NFIX architecture makes extensive use of Multiprotocol BGP, and
        it is recommended that the TCP Authentication Option (TCP-AO) <xref target="RFC5925"></xref>
        is used to protect the integrity of long-lived BGP sessions and any
        other TCP-based protocols.</t>

      <t>Where PCEP is used between controller and path headend the use of
        PCEPS <xref target="RFC8253"></xref> is recommended to provide confidentiality to PCEP
        communication using Transport Layer Security (TLS).</t>
    </section>

    <section anchor="Acknowledgements" title="Acknowledgements">
      <t>The authors would like to acknowledge Mustapha Aissaoui, Wim Henderickx, and Gunter Van
	  de Velde.</t>
    </section>


    <!-- Possibly a 'Contributors' section ... -->
	

    <section title="Contributors">
	
      <t>The following people contributed to the content of this document and should be considered
	  co-authors.</t>
	  
      <figure  anchor="Contributors" align="center">
                 <artwork align="left">
                   <![CDATA[
        Juan Rodriguez
	Nokia
	United States of America
	   
	Email: juan.rodriguez@nokia.com
	   
	Jorge Rabadan
	Nokia
	United States of America
	   
	Email: jorge.rabadan@nokia.com

	Nick Morris
	Verizon
	United States of America
	   
	Email: nicklous.morris@verizonwireless.com
	
	Eddie Leyton
	Verizon
	United States of America
	   
	Email: edward.leyton@verizonwireless.com	
	
                   ]]>
                 </artwork>
      </figure>	   
    </section>
	

    <section anchor="IANA" title="IANA Considerations">
      <t>This memo does not include any requests to IANA for allocation.</t>
    </section>
</middle>

<!--  *****BACK MATTER ***** -->
<back>
    <!-- References split to informative and normative -->
    <references title="Normative References">

        <!-- A *really* full, totally OTT reference - Note, the "target" attribute of the
             "reference": if you want a URI printed in the reference, this is where it goes. -->
        <reference anchor='RFC2119'
                   target='http://xml.resource.org/public/rfc/html/rfc2119.html'>
          <front>
            <title abbrev='RFC Key Words'>Key words for use in RFCs to Indicate Requirement
              Levels</title>
            <author initials='S.' surname='Bradner' fullname='Scott Bradner'>
              <organization>Harvard University</organization>
              <address>
                <postal>
                  <street>1350 Mass. Ave.</street>
                  <street>Cambridge</street>
                  <street>MA 02138</street>
                </postal>
                <phone>- +1 617 495 3864</phone>
                <email>sob@harvard.edu</email>
              </address>
            </author>
            <date year='1997' month='March' />
            <area>General</area>
            <keyword>keyword</keyword>
            <abstract>
              <t>In many standards track documents several words are used to signify
                the requirements in the specification.  These words are often
                capitalized.  This document defines these words as they should be
                interpreted in IETF documents.  Authors who follow these guidelines
                should incorporate this phrase near the beginning of their document:

                <list>
                  <t>
                    The key words &quot;MUST&quot;, &quot;MUST NOT&quot;,
                    &quot;REQUIRED&quot;, &quot;SHALL&quot;, &quot;SHALL NOT&quot;,
                    &quot;SHOULD&quot;, &quot;SHOULD NOT&quot;, &quot;RECOMMENDED&quot;,
                    &quot;MAY&quot;, and &quot;OPTIONAL&quot; in this document are to be
                    interpreted as described in RFC 2119.</t>
                </list>
              </t>
              <t>
                Note that the force of these words is modified by the requirement level of
                the document in which they are used.</t>
            </abstract>
          </front>

          <seriesInfo name='BCP' value='14' />
          <seriesInfo name='RFC' value='2119' />
          <format type='TXT' octets='4723' target='ftp://ftp.isi.edu/in-notes/rfc2119.txt' />
          <format type='HTML' octets='14486'
                  target='http://xml.resource.org/public/rfc/html/rfc2119.html' />
          <format type='XML' octets='5661'
                  target='http://xml.resource.org/public/rfc/xml/rfc2119.xml' />
        </reference>

        <!-- Right back at the beginning we defined an entity which (we asserted) would contain
             XML needed for a reference... this is where we use it. -->
                &RFC8174;
     </references>

    <references title="Informative References">

      <?rfc include='reference.I-D.ietf-nvo3-geneve'?>

      <?rfc include='reference.I-D.ietf-mpls-seamless-mpls'?>
  
      <?rfc include='reference.I-D.ietf-bess-evpn-ipvpn-interworking'?>

      <?rfc include='reference.I-D.ietf-spring-segment-routing-policy'?>

      <?rfc include='reference.I-D.ietf-rtgwg-segment-routing-ti-lfa'?>
  
      <?rfc include='reference.I-D.ietf-bess-nsh-bgp-control-plane'?>

      <?rfc include='reference.I-D.ietf-idr-te-lsp-distribution'?>

      <?rfc include='reference.I-D.barth-pce-segment-routing-policy-cp'?>

      <?rfc include='reference.I-D.filsfils-spring-sr-policy-considerations'?>	

      <?rfc include='reference.I-D.ietf-rtgwg-bgp-pic'?>	

      <?rfc include='reference.I-D.ietf-isis-mpls-elc'?>

      <?rfc include='reference.I-D.ietf-ospf-mpls-elc'?>

      <?rfc include='reference.I-D.ietf-idr-next-hop-capability'?>

      <?rfc include='reference.I-D.ietf-spring-segment-routing-central-epe'?>

      <?rfc include='reference.I-D.ietf-idr-long-lived-gr'?>		  
	  
      &RFC7938;
      &RFC7752;
      &RFC8277;
      &RFC8667;
      &RFC8665;
      &RFC8669;
      &RFC8663;
      &RFC7911;
      &RFC7880;
      &RFC4364;
      &RFC5920;
      &RFC7011;
      &RFC6241;
      &RFC6020;
      &RFC7854;
      &RFC8300;
      &RFC5440;
      &RFC7348;
      &RFC7637;
      &RFC3031;
      &RFC8014;
      &RFC8402;
      &RFC5883;
      &RFC8231;
      &RFC8281;
      &RFC5925;
      &RFC8253;
      &RFC6790;
      &RFC8662;
      &RFC8491;
      &RFC8476;	  

    </references>


</back>

</rfc>
