<?xml version="1.0" encoding="UTF-8"?>

<?xml-model href="rfc7991bis.rnc"?>
<!-- <?xml-stylesheet type="text/xsl" href="rfc2629.xslt" ?> -->
<!DOCTYPE rfc [
  <!ENTITY nbsp    "&#160;">
  <!ENTITY zwsp   "&#8203;">
  <!ENTITY nbhy   "&#8209;">
  <!ENTITY ouml   "&#246;">
  <!ENTITY uuml   "&#252;">
  <!ENTITY wj     "&#8288;">
<!-- One method to get references from the online citation libraries.
There has to be one entity for each item to be referenced.
An alternate method (rfc include) is described in the references. -->
<!ENTITY RFC0882 PUBLIC "" "https://xml2rfc.ietf.org/public/rfc/bibxml/reference.RFC.0882.xml">
<!ENTITY RFC0883 PUBLIC "" "https://xml2rfc.ietf.org/public/rfc/bibxml/reference.RFC.0883.xml">
<!ENTITY RFC1034 PUBLIC "" "https://xml2rfc.ietf.org/public/rfc/bibxml/reference.RFC.1034.xml">
<!ENTITY RFC1035 PUBLIC "" "https://xml2rfc.ietf.org/public/rfc/bibxml/reference.RFC.1035.xml">
<!ENTITY RFC2119 PUBLIC "" "https://xml2rfc.ietf.org/public/rfc/bibxml/reference.RFC.2119.xml">
<!ENTITY RFC2308 PUBLIC "" "https://xml2rfc.ietf.org/public/rfc/bibxml/reference.RFC.2308.xml">
<!ENTITY RFC4035 PUBLIC "" "https://xml2rfc.ietf.org/public/rfc/bibxml/reference.RFC.4035.xml">
<!ENTITY RFC4686 PUBLIC "" "https://xml2rfc.ietf.org/public/rfc/bibxml/reference.RFC.4686.xml">
<!ENTITY RFC4697 PUBLIC "" "https://xml2rfc.ietf.org/public/rfc/bibxml/reference.RFC.4697.xml">
<!ENTITY RFC4732 PUBLIC "" "https://xml2rfc.ietf.org/public/rfc/bibxml/reference.RFC.4732.xml">
<!ENTITY RFC5452 PUBLIC "" "https://xml2rfc.ietf.org/public/rfc/bibxml/reference.RFC.5452.xml">
<!ENTITY RFC6891 PUBLIC "" "https://xml2rfc.ietf.org/public/rfc/bibxml/reference.RFC.6891.xml">
<!ENTITY RFC7766 PUBLIC "" "https://xml2rfc.ietf.org/public/rfc/bibxml/reference.RFC.7766.xml">
<!ENTITY RFC7873 PUBLIC "" "https://xml2rfc.ietf.org/public/rfc/bibxml/reference.RFC.7873.xml">
<!ENTITY RFC7858 PUBLIC "" "https://xml2rfc.ietf.org/public/rfc/bibxml/reference.RFC.7858.xml">
<!ENTITY RFC8174 PUBLIC "" "https://xml2rfc.ietf.org/public/rfc/bibxml/reference.RFC.8174.xml">
<!ENTITY RFC8484 PUBLIC "" "https://xml2rfc.ietf.org/public/rfc/bibxml/reference.RFC.8484.xml">
<!ENTITY RFC8767 PUBLIC "" "https://xml2rfc.ietf.org/public/rfc/bibxml/reference.RFC.8767.xml">
<!ENTITY RFC8914 PUBLIC "" "https://xml2rfc.ietf.org/public/rfc/bibxml/reference.RFC.8914.xml">
<!ENTITY RFC9250 PUBLIC "" "https://xml2rfc.ietf.org/public/rfc/bibxml/reference.RFC.9250.xml">
]>
<?xml-stylesheet type="text/xsl" href="rfc2629.xslt"?>
<?rfc strict="yes" ?>
<?rfc toc="yes"?>
<?rfc tocdepth="4"?>
<?rfc symrefs="yes"?>
<?rfc sortrefs="yes" ?>
<?rfc compact="yes" ?>
<?rfc subcompact="no" ?>
<rfc xmlns:xi="http://www.w3.org/2001/XInclude" category="std" docName="draft-ietf-dnsop-caching-resolution-failures-08" ipr="trust200902" consensus="true" updates="2308, 4035, 4697" submissionType="IETF">

  <!-- category values: std, bcp, info, exp, and historic
  ipr values: trust200902, noModificationTrust200902,
  noDerivativesTrust200902, pre5378Trust200902
  you can add the attributes updates="NNNN" and obsoletes="NNNN"
  they will automatically be output with "(if approved)" -->

  <!-- ***** FRONT MATTER ***** -->

  <front>
    <!-- The abbreviated title is used in the page header - it is only necessary if the
    full title is longer than 39 characters -->

    <title abbrev="Caching Resolution Failures">Negative Caching of DNS Resolution Failures</title>

    <!-- add 'role="editor"' below for the editors if appropriate -->

    <!-- Authors (none of them is marked as an editor) -->

    <author fullname="Duane Wessels" initials="D." surname="Wessels">
      <organization>Verisign</organization>
      <address>
        <postal>
          <street>12061 Bluemont Way</street>
          <city>Reston</city>
          <region>VA</region>
          <code>20190</code>
          <country>US</country>
        </postal>
        <phone>+1 703 948-3200</phone>
        <email>dwessels@verisign.com</email>
        <uri>https://verisign.com</uri>
      </address>
    </author>

    <author fullname="William Carroll" initials="W." surname="Carroll">
      <organization>Verisign</organization>
      <address>
        <postal>
          <street>12061 Bluemont Way</street>
          <city>Reston</city>
          <region>VA</region>
          <code>20190</code>
          <country>US</country>
        </postal>
        <phone>+1 703 948-3200</phone>
        <email>wicarroll@verisign.com</email>
        <uri>https://verisign.com</uri>
      </address>
    </author>

    <author fullname="Matthew Thomas" initials="M." surname="Thomas">
      <organization>Verisign</organization>
      <address>
        <postal>
          <street>12061 Bluemont Way</street>
          <city>Reston</city>
          <region>VA</region>
          <code>20190</code>
          <country>US</country>
        </postal>
        <phone>+1 703 948-3200</phone>
        <email>mthomas@verisign.com</email>
        <uri>https://verisign.com</uri>
      </address>
    </author>

    <date year="2023"/>

    <area>General</area>

    <workgroup>Internet Engineering Task Force</workgroup>

    <keyword>DNS</keyword>
    <keyword>Negative</keyword>
    <keyword>Caching</keyword>

    <abstract>
      <t>
        In the DNS, resolvers employ caching to reduce both latency for
        end users and load on authoritative name servers.
        The process of
        resolution may result in one of three types of responses: (1) a
        response containing the requested data; (2) a response indicating
        the requested data does not exist; or (3) a non-response due to
        a resolution failure in which the resolver does not receive any
        useful information regarding the data's existence.  This document
        concerns itself only with the third type.
      </t>
      <t>
        RFC 2308 specifies requirements for DNS
        negative caching.  There, caching of type (2) responses
        is mandatory
        and caching of type (3) responses
        is optional.  This document updates RFC 2308
        to require negative caching
        for DNS resolution failures.
      </t>
      <t>
        RFC 4035 allows DNSSEC validation failure caching. This document updates RFC 4035 
        to require caching for DNSSEC validation failures. 
      </t>
      <t>
        RFC 4697 prohibits aggressive requerying for NS records at a failed zone's parent 
        zone. This document updates RFC 4697 to expand this requirement to all query types and to all 
        ancestor zones.
      </t>
    </abstract>
  </front>

  <middle>

    <section title="Introduction">

      <t>
        Caching has always been a fundamental component of DNS resolution
        on the Internet.  For example, <xref target="RFC0882"/> states:
      </t>
      <t>
         "The sheer size of the database and frequency of updates suggest
         that it must be maintained in a distributed manner, with local
         caching to improve performance."
      </t>
      <t>
        The early DNS RFCs (<xref target="RFC0882"/>, <xref
        target="RFC0883"/>, <xref target="RFC1034"/>, and <xref
        target="RFC1035"/>) primarily discuss caching in the context
        of what <xref target="RFC2308"/> calls "positive" responses,
        that is, when the response includes the requested data.
        In this case, a TTL is associated with each resource record in
        the response.  Resolvers can cache and reuse the data until the
        TTL expires.
      </t>
      <t>
        Section 4.3.4 of <xref target="RFC1034"/> describes negative
        response caching, but notes it is optional and only talks
        about name errors (NXDOMAIN).  This is the origin of using
        the SOA MINIMUM field as a negative caching TTL.
      </t>
      <t>
        <xref target="RFC2308"/> updated <xref target="RFC1034"/>
        to specify new requirements for DNS negative caching, including
        making it mandatory for caching resolvers to cache
        name error (NXDOMAIN) and no data (NODATA) responses 
        when a SOA record is available to provide a TTL.
        <xref target="RFC2308"/> further specified optional negative caching for two DNS 
        resolution failure cases: server failure and dead / unreachable servers.
      </t>
      <t>
        This document updates <xref target="RFC2308"/> to require
        negative caching of all DNS resolution failures 
        and provides additional examples of resolution failures.
        This document also updates <xref target="RFC4035"/> to require
        caching for DNSSEC validation failures as well as <xref target="RFC4697"/>
        to expand the scope of prohibiting aggressive requerying for NS
        records at a failed zone's parent zone to all query types and
        to all ancestor zones.
      </t>

      <section title="Motivation">
        <t>
          Operators of DNS services have known for some time that
          recursive resolvers become more aggressive when they
          experience resolution failures.  A number of different
          anecdotes, experiments, and incidents support this
          claim.
        </t>
        <t>
          In December 2009, a secondary server for a number of
          in-addr.arpa subdomains saw its traffic suddenly double, and
          queries of type DNSKEY in particular increase by approximately
          two orders of magnitude, coinciding with a DNSSEC key rollover
          by the zone operator <xref target="roll-over-and-die"/>.
          This predated a signed root zone, and an operating system
          vendor was providing non-root trust anchors to the recursive
          resolver, which became out of date following the rollover.
          Unable to validate responses for the affected in-addr.arpa
          zones, recursive resolvers aggressively retried their queries.
        </t>
        <t>
          In 2016, the internet infrastructure company Dyn experienced
          a large attack that impacted many high-profile customers.
          As documented in a technical presentation detailing the attack <xref target="dyn-attack"/>, Dyn staff wrote:
          "At this point we are now experiencing botnet attack traffic
          and what is best classified as a 'retry storm'.  Looking at
          certain large recursive platforms &gt; 10x normal volume."
        </t>
        <t>
          In 2018, the root zone key signing key (KSK) was rolled over
          <xref target="root-ksk-roll"/>.  Throughout the rollover
          period, the root servers experienced a significant increase in
          DNSKEY queries.  Before the rollover, a.root-servers.net and
          j.root-servers.net together received about 15 million DNSKEY
          queries per day.  At the end of the revocation period, they
          received 1.2 billion per day -- an 80x increase.  Removal of
          the revoked key from the zone caused DNSKEY queries to drop
          to post-rollover but pre-revoke levels, indicating there is
          still a population of recursive resolvers using the previous
          root trust anchor and aggressively retrying DNSKEY queries.
        </t>
        <t>
          In 2021, Verisign researchers used botnet query traffic
          to demonstrate that certain large, public recursive DNS
          services exhibit very high query rates when all authoritative
          name servers for a zone return REFUSED or SERVFAIL <xref
          target="botnet"/>. When the authoritative servers were configured normally, query rates for
          a single botnet domain averaged approximately 50 queries
          per second.  However, with the servers configured to return SERVFAIL,
          the query rate increased to 60,000 per second.  Furthermore,
          increases were also observed at the Root and TLD levels,
          even though delegations at those levels were unchanged and
          continued operating normally.
        </t>
        <t>
          Later that same year, on October 4, Facebook experienced a
          widespread and well-publicized outage <xref target="fb-outage"/>. During the 6-hour outage,
          none of Facebook's authoritative name servers were reachable or
          responded to queries. Recursive name servers attempting to
          resolve Facebook domains experienced timeouts. During this time,
          query traffic on the .COM/.NET infrastructure increased from
          7,000 to 900,000 queries per second <xref target="fb-outage-verisign"/>.
        </t>
      </section>

      <section title="Related Work">
        <t>
          <xref target="RFC2308"/> describes negative caching for four
          types of DNS queries and responses: name errors, no data,
          server failures, and dead / unreachable servers.  It places
          the strongest requirements on negative caching
          for name errors and no data responses, while server failures
          and dead servers are left as optional.
        </t>
        <t>
          <xref target="RFC4697"/> is a Best Current Practice that
          documents observed resolution misbehaviors.  It describes a
          number of situations that can lead to excessive queries from
          recursive resolvers, including: requerying for delegation data,
          lame servers, responses blocked by firewalls, and records
          with zero TTL.  <xref target="RFC4697"/> makes a number of
          recommendations, varying from "SHOULD" to "MUST."
        </t>
        <t>
          An expired Internet-Draft describes "The DNS thundering herd
          problem" <xref target="thundering-herd"/> as a situation arising
          when cached data expires at the same time for a large number
          of users.  Although that document is not focused on negative
          caching, it does describe the benefits of combining multiple,
          identical queries to upstream name servers.  That is, when
          a recursive resolver receives multiple queries for the same
          name, class, and type that cannot be answered from cached data,
          it should combine or join them into a single upstream query,
          rather than emit repeated, identical upstream queries.
        </t>
        <t>
          <xref target="RFC5452"/>, "Measures for Making DNS More
          Resilient against Forged Answers," includes a section that
          describes the phenomenon known as birthday attacks.  Here,
          again, the problem arises when a recursive resolver emits
          multiple, identical upstream queries.  Multiple outstanding
          queries make it easier for an attacker to guess and correctly
          match some of the DNS message parameters, such as the port
          number and ID field.  This situation is further exacerbated in the
          case of timeout-based resolution failures.  DNSSEC, of course,
          is a suitable defense to spoofing attacks.
        </t>
        <t>
          <xref target="RFC8767"/> describes "Serving Stale Data to Improve
          DNS Resiliency." This permits a recursive resolver to return
          possibly stale data when it is unable to refresh cached,
          expired data.  It introduces the idea of a failure recheck
          timer and says: "Attempts to refresh from non-responsive or
          otherwise failing authoritative nameservers are recommended
          to be done no more frequently than every 30 seconds."
        </t>
      </section>

      <section title="Terminology">
        <t>
          The key words "MUST", "MUST NOT", "REQUIRED", "SHALL", "SHALL NOT",
          "SHOULD", "SHOULD NOT", "RECOMMENDED", "NOT RECOMMENDED", "MAY", and
          "OPTIONAL" in this document are to be interpreted as described in
          BCP 14 <xref target="RFC2119"/> <xref target="RFC8174"/> when, and only when, they appear in all
          capitals, as shown here.
        </t>
        <ul>
          <li><t>DNS Transport: In this document, DNS transport means a protocol
            used to transport DNS messages between a client and a server.  This includes
            "classic DNS" transports, i.e., DNS-over-UDP and DNS-over-TCP <xref target="RFC1034" /> <xref target="RFC7766" />, as
            well as newer encrypted DNS transports such as DNS-over-TLS <xref target="RFC7858" />,
            DNS-over-HTTPS <xref target="RFC8484" />, DNS-over-QUIC <xref target="RFC9250" />, 
            and similar 
            communication of DNS messages using other protocols.            
            NOTE: at the time of this writing not all DNS transports are standardized for all types
            of servers, but may become standardized in the future.</t></li>
        </ul>
      </section>
    </section>

    <section title="Conditions That Lead to DNS Resolution Failures">

      <t>
        A DNS resolution failure occurs when none of the servers available
        to a resolver client provide any useful response data for a
        particular query name, type, and class. A response is considered
        useful when it provides either the requested data, a referral to a descendant zone,
        or an indication that no data exists at the given name.
      </t>
      <t>
        It is common for resolvers to have multiple servers from
        which to choose for a particular query.  For example,
        in the case of stub-to-recursive, the stub resolver may be
        configured with multiple recursive resolver addresses.  In the case of
        recursive-to-authoritative, a given zone usually has more than
        one name server (NS record), each of which can have multiple
        IP addresses and multiple DNS transports.
      </t>
      <t>
        Nothing in this document prevents a resolver from retrying a
        query at a different server, or the same server over a different
        DNS transport.  In the case of timeouts, a resolver can retry the
        same server and DNS transport a limited number of times.
      </t>
      <t>
        If any one of the available servers provides a useful response, then
        it is not considered a resolution failure.  However, if
        none of the servers for a given query tuple &lt;name, type, class&gt;
        provide a useful response, the result is a resolution failure.
      </t>
      <t>
        Note that NXDOMAIN and NOERROR/NODATA responses are not conditions
        for resolution failure.  In these cases, the server is providing
        a useful response, either indicating that a name does not exist,
        or that no data of the requested type exists at the name.
        These negative responses can be cached as described in <xref
        target="RFC2308"/>.
      </t>
      <t>
        The remainder of this section describes a number of different
        conditions that can lead to resolution failure. This section is not
        exhaustive. Additional conditions
        may be expected to cause similar resolution failures.
      </t>

      <section title="SERVFAIL Responses">
        <t>
          Server failure is defined in <xref target="RFC1035"/> as
          "The name server was unable to process this query due to a
          problem with the name server." A server failure is signaled
          by setting the RCODE field to SERVFAIL.
        </t>
        <t>
          Authoritative servers
          return SERVFAIL when they don't have
          any valid data for a zone.  For example, a secondary server has
          been configured to serve a particular zone, but is unable to
          retrieve or refresh the zone data from the primary server.
        </t>
        <t>
          Recursive servers return SERVFAIL in response to a
          number of different conditions, including many described below.
        </t>
        <t>
          Although the extended DNS errors method exists "primarily to extend SERVFAIL to
          provide additional information," it "does not change the processing of RCODEs"
          <xref target="RFC8914"/>.
          This document operates at the level of resolution failure and is not concerned with particular causes.
        </t>
      </section>

      <section title="REFUSED Responses">
        <t>
          A name server returns a message with the RCODE field set to REFUSED when it refuses to
          process the query, e.g., for policy or other reasons <xref target="RFC1035"/>.
        </t>
        <t>
          Authoritative servers generally return REFUSED when processing
          a query for which they are not authoritative.  For example,
          a server that is configured to be authoritative for only the
          example.net zone may return REFUSED in response to a query
          for example.com.
        </t>
        <t>
          Recursive servers generally return REFUSED for query
          sources that do not match configured access control lists.
          For example, a server that is configured to allow queries from
          only 2001:db8:1::/48 may return REFUSED in response to a query
          from 2001:db8:5::1.
        </t>
      </section>


      <section title="Timeouts and Unreachable Servers">
        <t>
          A timeout occurs when a resolver fails to receive any
          response from a server within a reasonable amount of time.
          Additionally, a DNS transport may more quickly indicate lack
          of reachability in a way that wouldn't be considered a timeout.
          Examples include an ICMP port unreachable message, a TCP "connection refused" error, and a TLS handshake failure.
          <xref target="RFC2308"/> refers to these conditions collectively as "dead / unreachable
          servers."
        </t>
        <t>
          Note that resolver implementations may have two types of
          timeouts: a smaller timeout which might trigger a query retry
          and a larger timeout after which the server is considered
          unresponsive.  <xref target="reqs-retries-timeouts"/> discusses
          the requirements for resolvers when retrying queries.
        </t>
        <t>
          Timeouts can present a particular problem for negative
          caching, depending on how the resolver handles multiple,
          outstanding queries for the same &lt;query name, type,
          class&gt; tuple.  For example, consider a very popular
          website in a zone whose name servers are all unresponsive.
          A recursive resolver might receive tens or hundreds of queries
          per second for the popular website.  If the recursive server
          implementation "joins" these outstanding queries together,
          then it only sends one recursive-to-authoritative query for
          the numerous pending stub-to-recursive queries.  If, however,
          the implementation does not join outstanding queries together,
          then it sends one recursive-to-authoritative query for each
          stub-to-recursive query.  If the incoming query rate is high
          and the timeout is large, this might result in hundreds or
          thousands of recursive-to-authoritative queries while waiting
          for an authoritative server to time out.
        </t>
        <t>
          A recursive resolver that does not join outstanding queries
          together is more susceptible to birthday attacks (Section 5 of
          <xref target="RFC5452"/>), especially when those queries
          result in timeouts.
        </t>
      </section>

      <section title="Delegation Loops">
        <t>
          A delegation loop, or cycle, can occur when one domain utilizes
          name servers in a second domain, and the second domain uses
          name servers in the first.  For example:
        </t>
        <figure><artwork align="left"><![CDATA[
FOO.EXAMPLE.    NS      NS1.EXAMPLE.COM.
FOO.EXAMPLE.    NS      NS2.EXAMPLE.COM.

EXAMPLE.COM.    NS      NS1.FOO.EXAMPLE.
EXAMPLE.COM.    NS      NS2.FOO.EXAMPLE.
]]></artwork></figure>
        <t>
          In this example, no names under foo.example or example.com can be
          resolved because of the delegation loop.  Note that a delegation loop
          may involve more than two domains.  A resolver that does not
          detect delegation loops may generate DDoS levels of attack traffic
          to authoritative name servers, as documented in the TsuNAME vulnerability
          <xref target="TsuNAME"/>.
        </t>
      </section>

      <section title="Alias Loops">
        <t>
          An alias loop, or cycle, can occur when one CNAME or DNAME RR refers to
          a second name, which in turn is specified as an alias for the first.
          For example:
        </t>
        <figure><artwork align="left"><![CDATA[
APP.FOO.EXAMPLE.        CNAME   APP.EXAMPLE.NET.
APP.EXAMPLE.NET.        CNAME   APP.FOO.EXAMPLE.
]]></artwork></figure>
      <t>
        The need to detect CNAME loops has been known since at least
        <xref target="RFC1034"/>, which states in Section 3.6.2:
      </t>
      <t>
        "Of course, by the robustness principle, domain software should
        not fail when presented with CNAME chains or loops; CNAME chains
        should be followed and CNAME loops signaled as an error."
      </t>
      </section>

      <section title="DNSSEC Validation Failures">
        <t>
          For zones that are signed with DNSSEC, a resolution failure can
          occur when a security-aware resolver believes it should be able
          to establish a chain-of-trust for an RRset but is unable to do
          so, possibly after trying multiple authoritative name servers.
          DNSSEC validation failures may be due to signature mismatch,
          missing DNSKEY RRs, problems with denial-of-existence records,
          clock skew,
          or other reasons.
        </t>
        <t>
          Section 4.7 of <xref target="RFC4035"/> already discusses
          the requirements and reasons for caching validation failures.
          <xref target="dnssec-reqs"/> of this document strengthens those requirements.
        </t>
      </section>

      <section title="FORMERR Responses">
        <t>
          A name server returns a message with the RCODE field set to
          FORMERR when it is unable to interpret the query <xref target="RFC1035"/>.  FORMERR
          responses are often associated with problems processing EDNS(0)
          Extensions <xref target="RFC6891"/>.  Authoritative servers
          may return FORMERR when they do not implement EDNS(0), or
          when EDNS(0) option fields are malformed, but not for unknown
          EDNS(0) options.
        </t>
        <t>
          Upon receipt of a FORMERR response, some recursive clients will
          retry their queries without EDNS(0), while others will not.  Nonetheless, resolution failures
          from FORMERR responses are rare.
        </t>
      </section>
     
    </section>

    <section title="Requirements for Caching DNS Resolution Failures">

      <section title="Retries and Timeouts" anchor="reqs-retries-timeouts">
        <t>
          A resolver MUST NOT retry a given query to a server address over a given DNS transport more than twice
          (i.e., three queries in total) before considering the server address
          unresponsive over that DNS transport for that query.
        </t>
        <t>
          A resolver MAY retry a given query over a different DNS transport to the same server
          if it has reason to believe the DNS transport is available for that server and is
          compatible with the resolver's security policies.
        </t>
        <t>
          This document does not place any requirements on how long an implementation should
          wait before retrying a query (aka timeout value),
          which may be implementation- or configuration-dependent.
          It is generally expected that typical timeout values range
          from 3 to 30 seconds.
        </t>
      </section>

      <section title="Caching" anchor="caching">
        <t>
          Resolvers MUST implement a cache for resolution failures.
          The purpose of this cache is to eliminate repeated upstream
          queries that cannot be resolved.
          When an incoming query matches a cached resolution failure, the resolver MUST NOT send
          any corresponding outgoing queries until after the cache entries expire.
        </t>
        <t>
          Implementation details for such a cache are not specified
          in this document.  The implementation might cache different
          resolution failure conditions differently.  For example, DNSSEC
          validation failures might be cached according to the queried
          name, class, and type, whereas unresponsive servers might be
          cached only according to the server's IP address.
          Developers should document their implementation choices so
          that operators know what behaviors to expect when resolution
          failures are cached.
        </t>
        <t>
          Resolvers MUST cache resolution failures for at least 1 second.
          Resolvers MAY cache different types of resolution failures for different (i.e., longer) amounts of time.
          Consistent with <xref target="RFC2308"/>, resolution failures MUST NOT be cached for longer than
          5 minutes.
        </t>
        <t>
          The minimum cache duration SHOULD be configurable by the operator.
          A longer cache duration for resolution failures will
          reduce the processing burden from repeated queries, but
          may also increase the time to recover from transitory issues.
        </t>
        <t>
          Resolvers SHOULD employ an exponential or linear backoff algorithm to
          increase the cache duration for persistent resolution failures. For example,
          the initial time for negatively caching a resolution failure
          might be set to 5 seconds, and increased after each retry that results
          in another resolution failure, up to a configurable maximum, not to exceed the 5-minute upper limit.
        </t>
        <t>
          Notwithstanding the above, resolvers SHOULD implement measures to mitigate resource exhaustion
          attacks on the failed resolution cache. That is, the resolver should limit the amount of memory
          and/or processing time devoted to this cache.
        </t>
      </section>

      <section title="Requerying Delegation Information">
        <t>
      	Section 2.1 of <xref target="RFC4697"/> identifies circumstances in which "every 
      	  name server in a zone's NS RRSet is unreachable (e.g., during a network outage), 
      	  unavailable (e.g., the name server process is not running on the server host), or
          misconfigured (e.g., the name server is not authoritative for the given zone, 
          also known as 'lame')." It prohibits unnecessary "aggressive requerying" to the
          parent of a non-responsive zone by sending NS queries.
        </t>
        <t>
          The problem of aggressive requerying to parent zones is not limited to queries of type NS.
          This document updates the requirement from Section 2.1.1 of <xref target="RFC4697"/>
          to apply more generally:
          Upon encountering a zone whose name servers are all non-responsive,
          a resolver MUST cache the resolution failure. 
          Furthermore, the resolver MUST limit queries to the non-responsive
          zone's parent zone (and to other ancestor zones) just as it
          would limit subsequent queries to the non-responsive zone.
        </t>
      </section>

      <section title="DNSSEC Validation Failures" anchor="dnssec-reqs">
        <t>
          Section 4.7 of <xref target="RFC4035"/> states:
        </t>
        <t>
          "To prevent such unnecessary DNS traffic, security-aware
          resolvers MAY cache data with invalid signatures, with some
          restrictions."
        </t>
        <t>
          This document updates <xref target="RFC4035"/> with the following, stronger requirement:
        </t>
        <t>
          To prevent such unnecessary DNS traffic, security-aware
          resolvers MUST cache DNSSEC validation failures, with some
          restrictions.
        </t>
        <t>
          One of the restrictions mentioned in <xref target="RFC4035"/>
          is to use a small TTL when caching data that fails DNSSEC
          validation. This is, in part, because the provided TTL cannot
          be trusted.  The advice from <xref target="caching"/>
          herein can be used as guidance on TTLs for caching DNSSEC
          validation failures.
        </t>
      </section>

    </section>

    <section title="IANA Considerations" anchor="iana">
      <t>
        This document has no IANA actions.
      </t>
    </section>

    <section title="Security Considerations" anchor="security">
      <t>
        As noted in <xref target="caching"/>, an attacker might attempt a resource
        exhaustion attack by sending queries for a large number
        of names and/or types that result in resolution failure.  Resolvers
        SHOULD implement measures to protect themselves and bound the
        amount of memory devoted to caching resolution failures.
      </t>
      <t>
        A cache poisoning attack (see Section 2.2 of <xref target="RFC7873"/>)
        resulting in denial of service
        may be possible because failure messages cannot be
        signed. An attacker might generate queries and send forged failure messages,
        causing the resolver to cease sending queries to the authoritative name server
        (see Section 2.6 of <xref target="RFC4732"/> for a similar "data corruption attack").
        However, this would require continued spoofing throughout the backoff period and repeated attacks
        due to the 5-minute cache limit. As in Section 4.1.12 of <xref target="RFC4686"/>,
        this attack's effects would be "localized and of limited duration."
      </t>
    </section>

    <section title="Privacy Considerations" anchor="privacy">
      <t>This specification has no impact on user privacy.</t>
    </section>

    <section title="Acknowledgments" anchor="acknowledgments">
      <t>
        The authors wish to thank
        Mukund Sivaraman,
        Petr Spacek,
        Peter van Dijk,
        Tim Wicinski,
        Joe Abley,
        Evan Hunt,
        Barry Leiba,
        Lucas Pardue,
        Paul Wouters,
        and other members of the DNSOP working group for their feedback and contributions.
      </t>
    </section>

    <section anchor="Changes" title="Change Log">
      <t>RFC Editor: Please remove this section before publication.</t>
      <t>This section lists substantial changes to the document as it is being worked on.</t>
      <t>From dwmtwc-00 to dwmtwc-01:
      <list style="symbols">
        <t>use phrase "the initial TTL for negatively caching a resolution failure" instead of "negative cache TTL"</t>
        <t>typos, etc</t>
      </list></t>
      <t>From dwmtwc-01 to ietf-00:
      <list style="symbols">
        <t>Adopted by WG</t>
      </list></t>
      <t>From -00 to -01:
      <list style="symbols">
        <t>Clarify retries and timeouts to apply on a per-query basis.</t>
        <t>Say more about the 5 second caching requirement in TTLs section.</t>
        <t>Expanded opening paragraphs of section 2, now titled "Conditions That Lead To DNS Resolution Failures".</t>
        <t>Text from the former section 3.3 ("Scope") moved to top of section 2.</t>
        <t>Section 3.2 was formerly "TTLs" and is now "Caching".  The draft no longer requires e.g. caching by tuples, but now just requires caching failures so that repeated queries are not sent out.</t>
        <t>State that resolvers should protect themselves from cache resource exhaustion attacks.</t>
      </list></t>
      <t>From -01 to -02:
      <list style="symbols">
         <t>Added cache poisoning attack to Security Considerations.</t>
      </list></t>
      <t>From -02 to -03:
      <list style="symbols">
         <t>Added missing reference to Verisign blog post.</t>
      </list></t>
      <t>From -03 to -04:
      <list style="symbols">
         <t>Address most of Peter van Dijk's DNS Directorate review comments.</t>
         <t>Removed "For Discussion" section from introduction referencing apparent inconsistent RFC2119 keyword use in RFC2308.</t>
         <t>Replaced "For Discussion" section from "Requerying Delegation Information" to generalize RFC 4697 requirements not to requery parent zones to cover all query types.</t>
         <t>Replaced "For Discussion" section from "DNSSEC Validation Failures" to strengthen RFC 4035 to require caching of DNSSEC validation failures.</t>
         <t>Added RFC 4035 and RFC 4697 to updated RFCs list.</t>
         <t>Added (empty) Implementation Status section.</t>
      </list></t>
      <t>From -04 to -05:
      <list style="symbols">
         <t>Expanded abstract to include updates to RFCs 4035 and 4697.</t>
         <t>Removed reference to unused terms from RFC 8126.</t>
         <t>Reworded "server transport" to "a server address over a given transport".</t>
         <t>Added explanatory text in "Server Failure" section for exclusion of extended DNS errors.</t>
         <t>Changed "Timeouts" section to "Timeouts and Unreachable Servers" and added reference to transport layer indicators from RFC 2308.</t>
         <t>Clarified meaning of "timeout value".</t>
      </list></t>
      <t>From -05 to -06:
      <list style="symbols">
        <t>Changed minimum 5 second caching to 1 second, with other changes to give implementors and operators more leeway.</t>
        <t>Changed "exponential backoff" to more general concept of increasing backoff.</t>
        <t>Added some implementation status notes for BIND, from dnsop list email.</t>
      </list>
      </t>
      <t>From -06 to -07:
      <list style="symbols">
        <t>Artart review: minor editorial clarifications</t>
        <t>Genart review: remove confusing and superfluous section references.</t>
        <t>Genart review: clarify resolution failure caching time range.</t>
        <t>Genart review: better define DNS transports</t>
        <t>Dnsdir review: clarify FORMERR response retries.</t>
      </list>
      </t>
      <t>From -07 to -08:
      <list style="symbols">
        <t>"only exacerbated" -> "further exacerbated"</t>
        <t>lowercase IPv6 addresses</t>
        <t>lowercase example domain in text</t>
        <t>updated introduction to include all updated RFCs</t>
        <t>change 3.2 SHOULD to should</t>
        <t>section 3.4: say a little about "some restrictions" from RFC 4035</t>
        <t>Intdir telechat review: a few grammatical nits</t>
        <t>Various IESG reviewer suggestions</t>
      </list>
      </t>
    </section>

    <section title="Implementation Status">
      <t>
        RFC Editor: Please remove this section before publication.
      </t>
      <t>
        This section records the status of known implementations of the
        protocol defined by this specification at the time of posting of
        this Internet-Draft, and is based on a proposal described in
        RFC 7942.  The description of implementations in this section is
        intended to assist the IETF in its decision processes in
        progressing drafts to RFCs.  Please note that the listing of any
        individual implementation here does not imply endorsement by the
        IETF.  Furthermore, no effort has been spent to verify the
        information presented here that was supplied by IETF contributors.
        This is not intended as, and must not be construed to be, a
        catalog of available implementations or their features.  Readers
        are advised to note that other implementations may exist.
      </t>
      <section title="BIND">
        <t>
          The following is excerpted from a message to the dnsop mailing list regarding
          how BIND caches resolution failures:
        </t>
        <t>
          BIND implemented a SERVFAIL cache in 2014 with a default
          cache duration of 10 seconds; after a slew of complaints, in 2015 we
          lowered it to 1 second, and also reduced the configurable maximum from
          5 minutes to 30 seconds. The reason was that certain common failure
          conditions are transitory, and it's not unreasonable to prioritize
          rapid recovery.
        </t>
        <t>
          Now, to be clear, the comparison isn't exactly apples to apples: the BIND
          SERVFAIL cache is a somewhat stupider mechanism than the one outlined in
          the draft. It caches *all* SERVFAIL responses, regardless of the reason
          they were generated. For example: when the cache is cold, a query may time
          out or hit DDoS mitigation limits before it's finished getting through the
          whole iteration process; an immediate retry would start further along the
          delegation chain and would succeed. Such problems weren't noticeable until
          we implemented the 10-second cache, but became very noticeable afterward.
        </t>
        <t>
          If we were able to selectively cache *only* those SERVFAILs that are
          unlikely to recover soon, then five seconds might indeed be a good starting
          point. But, with our relatively dumb cache, we found that one second did a
          fairly good job reducing the processing burden from repeated queries, and
          eliminated the user complaints about the resolver taking forever to recover
          from short-lived problems. It's been working well enough that it hasn't
          been a priority to develop a more complex failure cache.
        </t>
      </section>
    </section>

  </middle>
  <back>

    <references title="Normative References">
      &RFC1034;
      &RFC1035;
      &RFC2119;
      &RFC2308;
      &RFC4035;
      &RFC4697;
      &RFC8174;
    </references>

    <references title="Informative References">
      &RFC0882;
      &RFC0883;
      &RFC4686;
      &RFC4732;
      &RFC5452;
      &RFC6891;
      &RFC7766;
      &RFC7858;
      &RFC7873;
      &RFC8484;
      &RFC8767;
      &RFC8914;
      &RFC9250;

     <reference anchor="botnet" target="https://indico.dns-oarc.net/event/38/contributions/841/">
        <front>
          <title>Botnet Traffic Observed at Various Levels of the DNS Hierarchy</title>
          <author initials="D." surname="Wessels" fullname="Duane Wessels"/>
          <author initials="M." surname="Thomas" fullname="Matt Thomas"/>
          <date year="2021" month="May"/>
        </front>
     </reference>

     <reference anchor="fb-outage" target="https://engineering.fb.com/2021/10/05/networking-traffic/outage-details/">
        <front>
          <title>More details about the October 4 outage</title>
          <author initials="S." surname="Janardhan" fullname="Santosh Janardhan"/>
          <date year="2021" month="October"/>
        </front>
     </reference>

     <reference anchor="fb-outage-verisign" target="https://blog.verisign.com/security/facebook-dns-outage/">
        <front>
          <title>Observations on Resolver Behavior During DNS Outages</title>
          <author>
            <organization>Verisign</organization>
          </author>
          <date year="2022" month="January" day="20"/>
        </front>
     </reference>

     <reference anchor="TsuNAME" target="https://dl.acm.org/doi/10.1145/3487552.3487824">
        <front>
          <title>TsuNAME: exploiting misconfiguration and vulnerability to DDoS DNS</title>
          <author initials="G. C. M." surname="Moura" fullname="Giovane C. M. Moura"/>
          <author initials="S." surname="Castro" fullname="Sebastian Castro"/>
          <author initials="J." surname="Heidemann" fullname="John Heidemann"/>
          <author initials="W." surname="Hardaker" fullname="Wes Hardaker"/>
          <date year="2021" month="November"/>
        </front>
     </reference>

     <reference anchor="roll-over-and-die" target="https://www.potaroo.net/ispcol/2010-02/rollover.html">
        <front>
          <title>Roll Over and Die?</title>
          <author initials="G." surname="Michaelson" fullname="George Michaelson"/>
          <author initials="P." surname="Wallstr&ouml;m" fullname="Patrik Wallstr&ouml;m"/>
          <author initials="R." surname="Arends" fullname="Roy Arends"/>
          <author initials="G." surname="Huston" fullname="Geoff Huston"/>
          <date year="2010" month="February"/>
        </front>
     </reference>

     <reference anchor="dyn-attack" target="https://ccnso.icann.org/sites/default/files/file/field-file-attach/2017-04/presentation-oracle-dyn-ddos-dns-13mar17-en.pdf">
        <front>
          <title>Dyn, DDoS, and DNS</title>
          <author initials="A." surname="Sullivan" fullname="Andrew Sullivan"/>
          <date year="2017" month="March"/>
        </front>
     </reference>

     <reference anchor="root-ksk-roll" target="https://dl.acm.org/doi/10.1145/3355369.3355570">
        <front>
          <title>Roll, Roll, Roll Your Root: A Comprehensive Analysis of the First Ever DNSSEC Root KSK Rollover</title>
          <author fullname="Moritz M&uuml;ller" initials="M." surname="M&uuml;ller"/>
          <author fullname="Matthew Thomas" initials="M." surname="Thomas"/>
          <author fullname="Duane Wessels" initials="D." surname="Wessels"/>
          <author fullname="Wes Hardaker" initials="W." surname="Hardaker"/>
          <author fullname="Taejoong Chung" initials="T." surname="Chung"/>
          <author fullname="Willem Toorop" initials="W." surname="Toorop"/>
          <author fullname="Roland van Rijswijk-Deij" initials="R.v." surname="Rijswijk-Deij"/>
          <date year="2019" month="October"/>
        </front>
     </reference>

      <reference anchor="thundering-herd" target="https://datatracker.ietf.org/doc/draft-muks-dnsop-dns-thundering-herd/">
        <front>
          <title>The DNS thundering herd problem (expired Internet-Draft)</title>
          <author fullname="Mukund Sivaraman" initials="M." surname="Sivaraman"/>
          <author fullname="Cricket Liu" initials="C." surname="Liu"/>
          <date year="2020" month="June"/>
        </front>
      </reference>

    </references>

  </back>
</rfc>
