Control plane reads wrong values (INT)

Good morning, and before of all thank you for taking your time to read this.
I am implementing a simple form of INT reporting from the dataplane to the control plane, using the BMv2 software switch. The current P4 program is this one:

// SPDX-License-Identifier: Apache-2.0
/* -*- P4_16 -*- */
#include <core.p4>
#include <v1model.p4>

#define MAX_REG_SIZE 4096
#define MAX_PORTS 24

const bit<16> TYPE_IPV4 = 0x800;
const bit<16> TYPE_INT  = 0x808;
const bit<8> TYPE_ICMP = 0x001;

const bit<9> CPU_PORT = 142;
const bit<32> CPU_MIRROR = 100;

/*************************************************************************
*********************** H E A D E R S  ***********************************
*************************************************************************/

typedef bit<9>  egressSpec_t;
typedef bit<48> macAddr_t;
typedef bit<32> ip4Addr_t;

typedef bit<48> time_t;

header ethernet_t {
    macAddr_t dstAddr;
    macAddr_t srcAddr;
    bit<16>   etherType;
}

@controller_header("packet_in")
header int_t {
    bit<16> protocol;
    bit<48> ingress_global_timestamp;
    bit<48> egress_global_timestamp;
    bit<32> enq_timestamp;
    bit<19> enq_qdepth;
    bit<32> deq_timedelta;
    bit<19> deq_qdepth;
    bit<2> payload;
}

header ipv4_t {
    bit<4>    version;
    bit<4>    ihl;
    bit<8>    diffserv;
    bit<16>   totalLen;
    bit<16>   identification;
    bit<3>    flags;
    bit<13>   fragOffset;
    bit<8>    ttl;
    bit<8>    protocol;
    bit<16>   hdrChecksum;
    ip4Addr_t srcAddr;
    ip4Addr_t dstAddr;
}

struct metadata {
    /* empty */
}

struct headers {
    ethernet_t   ethernet;
    int_t        telemetry;
    ipv4_t       ipv4;
}

/*************************************************************************
*********************** P A R S E R  ***********************************
*************************************************************************/

parser MyParser(packet_in packet,
                out headers hdr,
                inout metadata meta,
                inout standard_metadata_t standard_metadata) {

    state start {
		packet.extract(hdr.ethernet);
		transition select (hdr.ethernet.etherType) {
			TYPE_IPV4: parse_ipv4;
            TYPE_INT: parse_int;
			default: accept;
		}
    }

    state parse_ipv4 {
		packet.extract(hdr.ipv4);
		transition select (hdr.ipv4.protocol) {
            default: accept;
        }
    }

    state parse_int {
        packet.extract(hdr.telemetry);
        transition select (hdr.telemetry.protocol) {
            default: accept;
        }
    }
}


/*************************************************************************
************   C H E C K S U M    V E R I F I C A T I O N   *************
*************************************************************************/

control MyVerifyChecksum(inout headers hdr, inout metadata meta) {
    apply {  }
}


/*************************************************************************
**************  I N G R E S S   P R O C E S S I N G   *******************
*************************************************************************/

control MyIngress(inout headers hdr,
                  inout metadata meta,
                  inout standard_metadata_t standard_metadata) {

    register<bit<32>>(MAX_REG_SIZE) byte_cnt_reg;
    register<time_t>(MAX_REG_SIZE) last_time_reg;

    action drop() {
        mark_to_drop(standard_metadata);
    }

    action ipv4_forward(macAddr_t macAddr, egressSpec_t port) {
        hdr.ethernet.dstAddr = macAddr;
		standard_metadata.egress_spec = port;
		hdr.ipv4.ttl = hdr.ipv4.ttl - 1;
        standard_metadata.priority = 0;
    }

    table ipv4_lpm {
        key = {
			hdr.ipv4.dstAddr: lpm;
        }
        actions = {
            ipv4_forward;
            drop;
            NoAction;
        }
        size = 1024;
        default_action = drop;
    }

	action source_drop(bit<7> probability) {
		bit<7> rand;
		random(rand, 0, 100);
		if (rand <= probability) {
			drop();
		}
	}

    action limit_bandwidth(bit<32> bandwidth) {
        bit<32> byte_cnt;
        bit<32> hashed;
        time_t last_time;
        hash(hashed, HashAlgorithm.crc32, (bit<32>) 0,
            { hdr.ipv4.srcAddr }, (bit<32>) MAX_REG_SIZE);
        byte_cnt_reg.read(byte_cnt, hashed);
        last_time_reg.read(last_time, hashed);
        time_t cur_time = standard_metadata.ingress_global_timestamp;
        if ((byte_cnt >> 10) >= bandwidth) {
            drop();
        } else if ((byte_cnt >> 10) >= (bandwidth >> 1)) {
            standard_metadata.priority = 3;
        }
        if (cur_time - last_time >= 1000000) {
            byte_cnt = 0;
            last_time_reg.write(hashed, cur_time);
        }
        byte_cnt = byte_cnt + standard_metadata.packet_length;
        byte_cnt_reg.write(hashed, byte_cnt);
    }

	table source_filter_exact {
		key = {
			hdr.ipv4.srcAddr: exact;
		}
		actions = {
			source_drop;
            limit_bandwidth;
			NoAction;
		}
		size = 1024;
		default_action = NoAction();
	}

    apply {
		if (hdr.ipv4.isValid() && standard_metadata.instance_type == 0) {
			ipv4_lpm.apply();
            source_filter_exact.apply();

            bit<7> rand;
            random(rand, 0, 100);
            if (rand <= 10)
                clone(CloneType.I2E, CPU_MIRROR);
		}
    }
}

/*************************************************************************
****************  E G R E S S   P R O C E S S I N G   *******************
*************************************************************************/

control MyEgress(inout headers hdr,
                 inout metadata meta,
                 inout standard_metadata_t standard_metadata) {

	action drop() {
		mark_to_drop(standard_metadata);
	}

	action rewrite_smac(macAddr_t macAddr) {
		hdr.ethernet.srcAddr = macAddr;
	}

	table smac_exact {
		key = {
			standard_metadata.egress_port: exact;
		}
		actions = {
			rewrite_smac;
			NoAction;
		}
		size = 1024;
		default_action = NoAction();
	}

    apply {
        smac_exact.apply();

		if (standard_metadata.mcast_grp != 0 && standard_metadata.egress_port == standard_metadata.ingress_port) {
			drop();
		}
        
        if (standard_metadata.instance_type != 0) {
	    hdr.ethernet.setInvalid();
	    hdr.ipv4.setInvalid();

            hdr.telemetry.setValid();
            hdr.telemetry.protocol = TYPE_IPV4;
            hdr.telemetry.ingress_global_timestamp = standard_metadata.ingress_global_timestamp;
            hdr.telemetry.egress_global_timestamp = standard_metadata.egress_global_timestamp;
            hdr.telemetry.enq_timestamp = standard_metadata.enq_timestamp;
            hdr.telemetry.enq_qdepth = standard_metadata.enq_qdepth;
            hdr.telemetry.deq_timedelta = standard_metadata.deq_timedelta;
            hdr.telemetry.deq_qdepth = standard_metadata.deq_qdepth;
            hdr.telemetry.payload = (bit<2>) 0;
        }
	}
}

/*************************************************************************
*************   C H E C K S U M    C O M P U T A T I O N   **************
*************************************************************************/

control MyComputeChecksum(inout headers hdr, inout metadata meta) {
     apply {
        update_checksum(
            hdr.ipv4.isValid(),
            { hdr.ipv4.version,
              hdr.ipv4.ihl,
              hdr.ipv4.diffserv,
              hdr.ipv4.totalLen,
              hdr.ipv4.identification,
              hdr.ipv4.flags,
              hdr.ipv4.fragOffset,
              hdr.ipv4.ttl,
              hdr.ipv4.protocol,
              hdr.ipv4.srcAddr,
              hdr.ipv4.dstAddr },
            hdr.ipv4.hdrChecksum,
            HashAlgorithm.csum16);
    }
}


/*************************************************************************
***********************  D E P A R S E R  *******************************
*************************************************************************/

control MyDeparser(packet_out packet, in headers hdr) {
    apply {
		packet.emit(hdr.ethernet);
		packet.emit(hdr.ipv4);
        //packet.emit(hdr.telemetry);      // Should be this instruction uncommented?
    }
}

/*************************************************************************
***********************  S W I T C H  *******************************
*************************************************************************/

V1Switch(
MyParser(),
MyVerifyChecksum(),
MyIngress(),
MyEgress(),
MyComputeChecksum(),
MyDeparser()
) main;

My current control plane implementation (written using p4runtime_sh Python wrapper: I haven’t found any implementation of PRE & Clone instructions in runtime_CLI (I need to use a P4Runtime-compatible control plane software, so simple_switch_CLI is out of order to me), is this the case? Are there any better alternatives, using directly the p4runtime_lib without the shell wrapper?) is this one:

import os
import argparse
import p4runtime_sh.shell as sh

def main(p4info, bmv2_json, server_addr, install):
    if install:
        sh.setup(device_id=0, grpc_addr=server_addr, election_id=(0,1), config=sh.FwdPipeConfig(p4info, bmv2_json))
    else:
        sh.setup(device_id=0, grpc_addr=server_addr, election_id=(0,1))

    # -----------------------------------------------------------
    # ------------------------- ENTRIES -------------------------
    # -----------------------------------------------------------

    sh.CloneSessionEntry(100).add(142, 1).insert()

    mac_addresses = {
        '1' : 'aa:ae:f3:59:16:88',
        '2' : 'da:81:d8:cc:55:2a',
        '3' : 'ba:f3:70:fa:d5:8f'
    }

    for port, mac in mac_addresses.items():
        te = sh.TableEntry('MyEgress.smac_exact')(action='MyEgress.rewrite_smac')
        te.match['standard_metadata.egress_port'] = port
        te.action['macAddr'] = mac
        te.insert()

    dst_mac_addresses = {
        '10.0.1.1': '02:f0:db:29:0e:f6',
        '10.0.2.1': 'ee:05:1f:41:be:aa',
        '10.0.3.1': 'fe:1d:8a:77:01:04'
    }
    
    for i in range(1, 4):           // Yes, I know this is poorly written
        dst_ip = '10.0.' + str(i) + '.1'
        te = sh.TableEntry('MyIngress.ipv4_lpm')(action='MyIngress.ipv4_forward')
        te.match['hdr.ipv4.dstAddr'] = dst_ip + '/24'
        te.action['macAddr'] = dst_mac_addresses[dst_ip]
        te.action['port'] = str(i)
        te.insert()


    # -----------------------------------------------------------
    # ---------------------RECEIVING INT-------------------------

    header_dict = {
        '1': 'Protocol',
        '2': 'Ingress Timestamp',
        '3': 'Egress Timestamp',
        '4': 'Enqueue Timestamp',
        '5': 'Enqueue Queue Depth',
        '6': 'Time in queue',
        '7': 'Dequeue Queue Depth',
        '8': 'Payload'
    }

    pkt_in = sh.PacketIn()

    while True:
        for pkt in pkt_in.sniff(timeout=1):
            print('----- PACKET------')
            for md in pkt.packet.metadata:
                print(header_dict.get(str(md.metadata_id)) + ': ', end='')
                print(str(int.from_bytes(md.value, byteorder='big')))

    sh.teardown()

    # -----------------------------------------------------------

if __name__ == '__main__':
    base_path = os.getcwd().split('/')[-1]

    parser = argparse.ArgumentParser(description="P4 Runtime Controller written in Python to support P4 dataplane programming")
    parser.add_argument('-a', '--address', type=str, default='localhost:50051', help='Address of the switch, in the format <IP>:<PORT>')
    parser.add_argument('-i', '--install', action='store_true', help='If specified, installs the pipeline in the dataplane')
    parser.add_argument('--p4info', help='P4Info file in text format from p4c', type=str, action='store', required=False, default='./' + base_path + '.p4info.txtpb')
    parser.add_argument('--bmv2-json', help='BMv2 JSON file from p4c', type=str, action="store", required=False, default="./" + base_path + ".json")
    
    args = parser.parse_args()

    if not os.path.exists(args.p4info):
        parser.print_help()
        print('\nP4Info file not found: %s\nHave you compiled with p4c?' % args.p4info)
        parser.exit(1)
    if not os.path.exists(args.bmv2_json):
        parser.print_help()
        print('\nBMv2 JSON file not found :%s\nHave you compiled with p4c?' % args.bmv2_json)
        parser.exit(1)

    main(args.p4info, args.bmv2_json, args.address, args.install)

However, you can see I am writing always the same and known values for the “protocol” and “payload” fields in the controller header metadata for packet_in, but when reading them from the control plane they are respectively “2048” or “0” (one time it is the first, then it is the latter), and “2”: therefore, I strongly suppose even the other values are wrong (in particular, the dequeue timedelta seems to change a lot during light load in the network traffic, and the ingress/egress timestamps seem very high even considering they are in microseconds).
I do not think it is a problem in the write phase, because traffic is flowing as intended and everything is working.

Moreover, the CloneSessionEntry().add().insert() seems to have an “append” semantics rather than a “write” semantic in the entries of the dataplane: if I execute this Python script multiple times without restarting the software switch, I get N packet_in instances for incoming packet, with N equal to the amount of executions of this Python script after the last restart of the switch. Is this intended behaviour?

To conclude, I have one more side question: are there any form of pre-parsing queues for incoming packets? How does work, for BMv2 software switch, the “pre-parsing phase”: in other words, how and where are packets stored before entering the user-defined pipeline?

Thank you all for your help!

EDIT: I am sorry, I forgot to mention the launch parameters of simple_switch_grpc:

sudo simple_switch_grpc --no-p4 -i 1@ens19 -i 2@ens20 -i 3@ens21 --log-console -- --grpc-server-addr localhost:50051 --cpu-port 142 --priority-queues 4

EDIT 2: changing the Python script code to only print the packets captured through the sniff() function of PacketIn(), the output is the following:

Here is a code snippet that is part of a small test Python P4Runtime API client program that uses p4runtime-shell APIs to configure a multicast group: p4-guide/demo7/ptf/demo7.py at master · jafingerhut/p4-guide · GitHub
The P4 program that it corresponds with is in the directory above that.

Here is a code snippet that configures a clone session with a list of output ports to go to. That Python program always only configures it with a list of length 1. I do not recall if I have ever tried to test a situation where I configured the clone session to be replicated more than once: p4-guide/flowcache/controller.py at master · jafingerhut/p4-guide · GitHub

I have not tried to change the configuration of a clone session from making one or more copies, to making 0 copies, so I have no idea whether p4runtime-shell does that correctly or not.

Here is a test program that uses PacketIn and PacketOut with some simple controller metadata headers, but they are not the same metadata fields that you want. At least it is a working example you could study to see if you are doing anything significantly different: p4-guide/ptf-tests/packetinout at master · jafingerhut/p4-guide · GitHub

I believe that when packets arrive to a BMv2 port, it stores them in memory in a FIFO queue of packets before they begin executing the user-defined pipeline. I do not know how deep that queue can get. In general, if you are hoping that you can use BMv2 to simulate the performance behavior of a hardware switch ASIC, I think you are hoping for something that will be impossible to achieve without making noticeable changes to BMv2 source code.

Thank you for taking your time to answer, it was very helpful.

I fixed the wrong values problem by emitting (in the deparser) my INT header, in particular as the first header, before the others. Maybe I’ll try emitting it after the Ethernet and IPv4 headers to see what happens (it depends on how the control plane interprets the packet, I think).

I also changed the replication_id in the CloneSessionEntry from 1 to 0, I still don’t know what changes (and whether it matters for the correct execution flow of the program) but I’ll refer to the P4Runtime specifications to further investigate.

Then I have two questions, but a bit out-of-topic:

  1. Is there any plan to support some form of PRE &/|| Clone operations in the runtime_CLI?
  2. I do know the performance of BMv2 is very limited (I’m even executing it in a VM), but is there anyway from its architecture I can do to manage traffic surges? My limit_bandwidth Ingress action limits the bandwidth to a certain threshold for a specific IP address, but when using it for a client that sends a lot of iperf3 traffic to a server and another client pinging the same server (in a star topology, with only one switch with each port connected to a single of these three hosts [client1, client2, server]) I get worse latency in the ping than when not activating the bandwidth limiting mechanism: is that the software that is too slow executing the instructions required for bandwidth limiting? Is there any “conceptual” workaround to this problematic (and so without getting proper, better hardware)?

Thank you so much for your help, flagging the problem as solved, but if you can answer these two final questions it would be very much appreciated.

I have not checked the implementation to be certain, but it would surprise me if the controller metadata header were supported anywhere other than as the first head of a packet sent to the CPU port. Trying to support it in any other place in the packet leads to the question “how does the control plane software know where to find it?”

The replication_id becomes the value of a metadata field called egress_rid in the v1model architecture p4c/p4include/v1model.p4 at main · p4lang/p4c · GitHub. The intent is that if you want to send more than one copy of a packet to the same output port, your P4 program can use the value of egress_rid to distinguish the multiple copies to the same output port, and choose to do something different with each copy if you wish. If you do not need that capability, your P4 program can simply ignore the value of egress_rid.

The runtime_CLI command might not support these configuration commands, but simple_switch_CLI does support configuration of clone sessions (they are called mirror sessions there – just another name for a clone session): behavioral-model/docs/runtime_CLI.md at main · p4lang/behavioral-model · GitHub

The multicast group configuration command documentation can be found here: behavioral-model/docs/runtime_CLI.md at main · p4lang/behavioral-model · GitHub

If the performance issues you are seeing are due to a large number of packets being stored in BMv2’s FIFO queue of packets that have been received from an input port, and waiting to be processed by your P4 program, I can think of two possible small changes you could try:

(a) modify the BMv2 code to reduce the maximum depth of that FIFO queue down to some small number of pakets, e.g. 1 to 4 or so. Note that this would likely have the effect that if packets are arriving to BMv2 faster than it can do ingress processing on them, likely those packets would simply be dropped after arriving on that input port.

(b) Do some kind of rate limiting on the interfaces upstream of the BMv2 switch, to slow down the rate at which packets are sent to the BMv2 switch.

Thank you very much for your help, it has been helpful.

I have added some notes about the CPU port, and PacketIn/PacketOut messages, to my example packetinout P4 program, here: p4-guide/ptf-tests/packetinout at master · jafingerhut/p4-guide · GitHub