Rebase to 2.10.1

Branch: f38
Author: Timothy Redaelli
parent 9aaef75e21
commit 8fa9ef1e16

@@ -1,218 +0,0 @@
From b37f8c15ca6ee079541b0c02ee77ce9d392b18fc Mon Sep 17 00:00:00 2001
Message-Id: <b37f8c15ca6ee079541b0c02ee77ce9d392b18fc.1538494812.git.lorenzo.bianconi@redhat.com>
In-Reply-To: <cover.1538494812.git.lorenzo.bianconi@redhat.com>
References: <cover.1538494812.git.lorenzo.bianconi@redhat.com>
From: Lorenzo Bianconi <lorenzo.bianconi@redhat.com>
Date: Thu, 20 Sep 2018 16:46:02 +0200
Subject: [PATCH] OVN: add CT_LB action to ovn-trace

Add the CT_LB action to the ovn-trace utility in order to fix the
following ovn-trace error when a load balancer rule is added to the
OVN configuration:

ct_next(ct_state=est|trk /* default (use --ct to customize) */) {
    *** ct_lb action not implemented;
};

Add the '--lb-dst' option in order to specify the IP address to use
from the VIP pool. If --lb-dst is not provided, the destination IP is
chosen randomly.
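
As a hypothetical usage sketch (the logical datapath name, microflow
expression, and addresses below are illustrative only; only the --lb-dst
option itself comes from this patch):

    $ ovn-trace --lb-dst=192.168.1.10:80 sw0 \
        'inport == "sw0-port1" && eth.src == 00:00:00:00:00:01 &&
         ip4.src == 192.168.0.100 && ip4.dst == 30.0.0.1 &&
         tcp && tcp.dst == 80'
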
Signed-off-by: Lorenzo Bianconi <lorenzo.bianconi@redhat.com>
Signed-off-by: Ben Pfaff <blp@ovn.org>
---
ovn/utilities/ovn-trace.8.xml | 18 ++++++-
ovn/utilities/ovn-trace.c | 98 +++++++++++++++++++++++++++++++++--
2 files changed, 111 insertions(+), 5 deletions(-)
--- a/ovn/utilities/ovn-trace.8.xml
+++ b/ovn/utilities/ovn-trace.8.xml
@@ -253,9 +253,17 @@
<code>ct_snat</code>) action.
</dd>
- <dt><code>ct_lb</code></dt>
+ <dt><code>ct_lb;</code></dt>
+ <dt><code>ct_lb(<var>ip</var></code>[<code>:<var>port</var></code>]...<code>);</code></dt>
<dd>
- Not yet implemented; currently implemented as a no-op.
+ Forks the pipeline. In one fork, sets <code>ip4.dst</code> (or
+ <code>ip6.dst</code>) to one of the load-balancer addresses and the
+ destination port to its associated port, if any, and sets
+ <code>ct.dnat</code> to 1. With one or more arguments, gives preference
+ to the address specified on <code>--lb-dst</code>, if any; without
+ arguments, uses the address and port specified on <code>--lb-dst</code>.
+ In the other fork, the pipeline continues without change after the
+ <code>ct_lb</code> action.
</dd>
<dt><code>ct_commit</code></dt>
@@ -424,6 +432,12 @@
</p>
</dd>
+ <dt><code>--lb-dst=</code><var>ip</var>[<code>:<var>port</var></code>]</dt>
+ <dd>
+ Sets the IP from VIP pool to use as destination of the packet.
+ <code>--lb-dst</code> is not available in daemon mode.
+ </dd>
+
<dt><code>--friendly-names</code></dt>
<dt><code>--no-friendly-names</code></dt>
<dd>
--- a/ovn/utilities/ovn-trace.c
+++ b/ovn/utilities/ovn-trace.c
@@ -46,6 +46,7 @@
#include "stream.h"
#include "unixctl.h"
#include "util.h"
+#include "random.h"
VLOG_DEFINE_THIS_MODULE(ovntrace);
@@ -77,6 +78,9 @@ static uint32_t *ct_states;
static size_t n_ct_states;
static size_t ct_state_idx;
+/* --lb-dst: load balancer destination info. */
+static struct ovnact_ct_lb_dst lb_dst;
+
/* --friendly-names, --no-friendly-names: Whether to substitute human-friendly
* port and datapath names for the awkward UUIDs typically used in the actual
* logical flows. */
@@ -187,6 +191,24 @@ parse_ct_option(const char *state_s_)
}
static void
+parse_lb_option(const char *s)
+{
+ struct sockaddr_storage ss;
+ if (!inet_parse_active(s, 0, &ss)) {
+ ovs_fatal(0, "%s: bad address", s);
+ }
+
+ lb_dst.family = ss.ss_family;
+ struct in6_addr a = ss_get_address(&ss);
+ if (ss.ss_family == AF_INET) {
+ lb_dst.ipv4 = in6_addr_get_mapped_ipv4(&a);
+ } else {
+ lb_dst.ipv6 = a;
+ }
+ lb_dst.port = ss_get_port(&ss);
+}
+
+static void
parse_options(int argc, char *argv[])
{
enum {
@@ -202,7 +224,8 @@ parse_options(int argc, char *argv[])
OPT_NO_FRIENDLY_NAMES,
DAEMON_OPTION_ENUMS,
SSL_OPTION_ENUMS,
- VLOG_OPTION_ENUMS
+ VLOG_OPTION_ENUMS,
+ OPT_LB_DST
};
static const struct option long_options[] = {
{"db", required_argument, NULL, OPT_DB},
@@ -217,6 +240,7 @@ parse_options(int argc, char *argv[])
{"no-friendly-names", no_argument, NULL, OPT_NO_FRIENDLY_NAMES},
{"help", no_argument, NULL, 'h'},
{"version", no_argument, NULL, 'V'},
+ {"lb-dst", required_argument, NULL, OPT_LB_DST},
DAEMON_LONG_OPTIONS,
VLOG_LONG_OPTIONS,
STREAM_SSL_LONG_OPTIONS,
@@ -274,6 +298,10 @@ parse_options(int argc, char *argv[])
use_friendly_names = false;
break;
+ case OPT_LB_DST:
+ parse_lb_option(optarg);
+ break;
+
case 'h':
usage();
@@ -1823,6 +1851,71 @@ execute_ct_nat(const struct ovnact_ct_na
}
static void
+execute_ct_lb(const struct ovnact_ct_lb *ct_lb,
+ const struct ovntrace_datapath *dp, struct flow *uflow,
+ enum ovnact_pipeline pipeline, struct ovs_list *super)
+{
+ struct flow ct_lb_flow = *uflow;
+
+ int family = (ct_lb_flow.dl_type == htons(ETH_TYPE_IP) ? AF_INET
+ : ct_lb_flow.dl_type == htons(ETH_TYPE_IPV6) ? AF_INET6
+ : AF_UNSPEC);
+ if (family != AF_UNSPEC) {
+ const struct ovnact_ct_lb_dst *dst = NULL;
+ if (ct_lb->n_dsts) {
+ /* For ct_lb with addresses, choose one of the addresses. */
+ int n = 0;
+ for (int i = 0; i < ct_lb->n_dsts; i++) {
+ const struct ovnact_ct_lb_dst *d = &ct_lb->dsts[i];
+ if (d->family != family) {
+ continue;
+ }
+
+ /* Check for the destination specified by --lb-dst, if any. */
+ if (lb_dst.family == family
+ && (family == AF_INET
+ ? d->ipv4 == lb_dst.ipv4
+ : ipv6_addr_equals(&d->ipv6, &lb_dst.ipv6))) {
+ lb_dst.family = AF_UNSPEC;
+ dst = d;
+ break;
+ }
+
+ /* Select a random destination as a fallback. */
+ if (!random_range(++n)) {
+ dst = d;
+ }
+ }
+
+ if (!dst) {
+ ovntrace_node_append(super, OVNTRACE_NODE_ERROR,
+ "*** no load balancing destination "
+ "(use --lb-dst)");
+ }
+ } else if (lb_dst.family == family) {
+ /* For ct_lb without addresses, use user-specified address. */
+ dst = &lb_dst;
+ }
+
+ if (dst) {
+ if (family == AF_INET6) {
+ ct_lb_flow.ipv6_dst = dst->ipv6;
+ } else {
+ ct_lb_flow.nw_dst = dst->ipv4;
+ }
+ if (dst->port) {
+ ct_lb_flow.tp_dst = htons(dst->port);
+ }
+ ct_lb_flow.ct_state |= CS_DST_NAT;
+ }
+ }
+
+ struct ovntrace_node *node = ovntrace_node_append(
+ super, OVNTRACE_NODE_TRANSFORMATION, "ct_lb");
+ trace__(dp, &ct_lb_flow, ct_lb->ltable, pipeline, &node->subs);
+}
+
+static void
execute_log(const struct ovnact_log *log, struct flow *uflow,
struct ovs_list *super)
{
@@ -1910,8 +2003,7 @@ trace_actions(const struct ovnact *ovnac
break;
case OVNACT_CT_LB:
- ovntrace_node_append(super, OVNTRACE_NODE_ERROR,
- "*** ct_lb action not implemented");
+ execute_ct_lb(ovnact_get_CT_LB(a), dp, uflow, pipeline, super);
break;
case OVNACT_CT_CLEAR:

@@ -1,228 +0,0 @@
From 769b50349f28c5f9e4bff102bc61dadcb9b99c37 Mon Sep 17 00:00:00 2001
From: Ben Pfaff <blp@ovn.org>
Date: Tue, 25 Sep 2018 15:14:13 -0700
Subject: [PATCH] dpif: Remove support for multiple queues per port.

Commit 69c51582ff78 ("dpif-netlink: don't allocate per thread netlink
sockets") removed dpif-netlink support for multiple queues per port.
No remaining dpif provider supports multiple queues per port, so
remove infrastructure for the feature.
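
For reference, the simplified caller-facing prototype after this change, as
shown in the lib/dpif.h hunk below, is:

    uint32_t dpif_port_get_pid(const struct dpif *, odp_port_t port_no);
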
CC: Matteo Croce <mcroce@redhat.com>
Signed-off-by: Ben Pfaff <blp@ovn.org>
Tested-by: Yifeng Sun <pkusunyifeng@gmail.com>
Reviewed-by: Yifeng Sun <pkusunyifeng@gmail.com>
---
lib/dpif-netlink.c | 9 ++++-----
lib/dpif-provider.h | 14 ++------------
lib/dpif.c | 15 +++------------
lib/dpif.h | 15 +--------------
ofproto/ofproto-dpif-upcall.c | 7 +++----
ofproto/ofproto-dpif-xlate.c | 6 ++----
6 files changed, 15 insertions(+), 51 deletions(-)
diff --git a/lib/dpif-netlink.c b/lib/dpif-netlink.c
index 4736d21d4..21315033c 100644
--- a/lib/dpif-netlink.c
+++ b/lib/dpif-netlink.c
@@ -234,7 +234,7 @@ static bool ovs_tunnels_out_of_tree = true;
static int dpif_netlink_init(void);
static int open_dpif(const struct dpif_netlink_dp *, struct dpif **);
static uint32_t dpif_netlink_port_get_pid(const struct dpif *,
- odp_port_t port_no, uint32_t hash);
+ odp_port_t port_no);
static void dpif_netlink_handler_uninit(struct dpif_handler *handler);
static int dpif_netlink_refresh_channels(struct dpif_netlink *,
uint32_t n_handlers);
@@ -991,7 +991,7 @@ dpif_netlink_port_query_by_name(const struct dpif *dpif_, const char *devname,
static uint32_t
dpif_netlink_port_get_pid__(const struct dpif_netlink *dpif,
- odp_port_t port_no, uint32_t hash OVS_UNUSED)
+ odp_port_t port_no)
OVS_REQ_RDLOCK(dpif->upcall_lock)
{
uint32_t port_idx = odp_to_u32(port_no);
@@ -1015,14 +1015,13 @@ dpif_netlink_port_get_pid__(const struct dpif_netlink *dpif,
}
static uint32_t
-dpif_netlink_port_get_pid(const struct dpif *dpif_, odp_port_t port_no,
- uint32_t hash)
+dpif_netlink_port_get_pid(const struct dpif *dpif_, odp_port_t port_no)
{
const struct dpif_netlink *dpif = dpif_netlink_cast(dpif_);
uint32_t ret;
fat_rwlock_rdlock(&dpif->upcall_lock);
- ret = dpif_netlink_port_get_pid__(dpif, port_no, hash);
+ ret = dpif_netlink_port_get_pid__(dpif, port_no);
fat_rwlock_unlock(&dpif->upcall_lock);
return ret;
diff --git a/lib/dpif-provider.h b/lib/dpif-provider.h
index debdafc42..eb3ee50a6 100644
--- a/lib/dpif-provider.h
+++ b/lib/dpif-provider.h
@@ -191,16 +191,7 @@ struct dpif_class {
/* Returns the Netlink PID value to supply in OVS_ACTION_ATTR_USERSPACE
* actions as the OVS_USERSPACE_ATTR_PID attribute's value, for use in
- * flows whose packets arrived on port 'port_no'. In the case where the
- * provider allocates multiple Netlink PIDs to a single port, it may use
- * 'hash' to spread load among them. The caller need not use a particular
- * hash function; a 5-tuple hash is suitable.
- *
- * (The datapath implementation might use some different hash function for
- * distributing packets received via flow misses among PIDs. This means
- * that packets received via flow misses might be reordered relative to
- * packets received via userspace actions. This is not ordinarily a
- * problem.)
+ * flows whose packets arrived on port 'port_no'.
*
* A 'port_no' of UINT32_MAX should be treated as a special case. The
* implementation should return a reserved PID, not allocated to any port,
@@ -212,8 +203,7 @@ struct dpif_class {
*
* A dpif provider that doesn't have meaningful Netlink PIDs can use NULL
* for this function. This is equivalent to always returning 0. */
- uint32_t (*port_get_pid)(const struct dpif *dpif, odp_port_t port_no,
- uint32_t hash);
+ uint32_t (*port_get_pid)(const struct dpif *dpif, odp_port_t port_no);
/* Attempts to begin dumping the ports in a dpif. On success, returns 0
* and initializes '*statep' with any data needed for iteration. On
diff --git a/lib/dpif.c b/lib/dpif.c
index 85cf9000e..4697a4dcd 100644
--- a/lib/dpif.c
+++ b/lib/dpif.c
@@ -737,16 +737,7 @@ dpif_port_query_by_name(const struct dpif *dpif, const char *devname,
/* Returns the Netlink PID value to supply in OVS_ACTION_ATTR_USERSPACE
* actions as the OVS_USERSPACE_ATTR_PID attribute's value, for use in
- * flows whose packets arrived on port 'port_no'. In the case where the
- * provider allocates multiple Netlink PIDs to a single port, it may use
- * 'hash' to spread load among them. The caller need not use a particular
- * hash function; a 5-tuple hash is suitable.
- *
- * (The datapath implementation might use some different hash function for
- * distributing packets received via flow misses among PIDs. This means
- * that packets received via flow misses might be reordered relative to
- * packets received via userspace actions. This is not ordinarily a
- * problem.)
+ * flows whose packets arrived on port 'port_no'.
*
* A 'port_no' of ODPP_NONE is a special case: it returns a reserved PID, not
* allocated to any port, that the client may use for special purposes.
@@ -757,10 +748,10 @@ dpif_port_query_by_name(const struct dpif *dpif, const char *devname,
* update all of the flows that it installed that contain
* OVS_ACTION_ATTR_USERSPACE actions. */
uint32_t
-dpif_port_get_pid(const struct dpif *dpif, odp_port_t port_no, uint32_t hash)
+dpif_port_get_pid(const struct dpif *dpif, odp_port_t port_no)
{
return (dpif->dpif_class->port_get_pid
- ? (dpif->dpif_class->port_get_pid)(dpif, port_no, hash)
+ ? (dpif->dpif_class->port_get_pid)(dpif, port_no)
: 0);
}
diff --git a/lib/dpif.h b/lib/dpif.h
index 8fdfe5f00..1a35cc410 100644
--- a/lib/dpif.h
+++ b/lib/dpif.h
@@ -274,18 +274,6 @@
*
* - Upcalls that specify the "special" Netlink PID are queued separately.
*
- * Multiple threads may want to read upcalls simultaneously from a single
- * datapath. To support multiple threads well, one extends the above preferred
- * behavior:
- *
- * - Each port has multiple PIDs. The datapath distributes "miss" upcalls
- * across the PIDs, ensuring that a given flow is mapped in a stable way
- * to a single PID.
- *
- * - For "action" upcalls, the thread can specify its own Netlink PID or
- * other threads' Netlink PID of the same port for offloading purpose
- * (e.g. in a "round robin" manner).
- *
*
* Packet Format
* =============
@@ -470,8 +458,7 @@ int dpif_port_query_by_name(const struct dpif *, const char *devname,
struct dpif_port *);
int dpif_port_get_name(struct dpif *, odp_port_t port_no,
char *name, size_t name_size);
-uint32_t dpif_port_get_pid(const struct dpif *, odp_port_t port_no,
- uint32_t hash);
+uint32_t dpif_port_get_pid(const struct dpif *, odp_port_t port_no);
struct dpif_port_dump {
const struct dpif *dpif;
diff --git a/ofproto/ofproto-dpif-upcall.c b/ofproto/ofproto-dpif-upcall.c
index 62222079f..0cc964a7f 100644
--- a/ofproto/ofproto-dpif-upcall.c
+++ b/ofproto/ofproto-dpif-upcall.c
@@ -1021,7 +1021,6 @@ classify_upcall(enum dpif_upcall_type type, const struct nlattr *userdata,
* initialized with at least 128 bytes of space. */
static void
compose_slow_path(struct udpif *udpif, struct xlate_out *xout,
- const struct flow *flow,
odp_port_t odp_in_port, ofp_port_t ofp_in_port,
struct ofpbuf *buf, uint32_t meter_id,
struct uuid *ofproto_uuid)
@@ -1038,7 +1037,7 @@ compose_slow_path(struct udpif *udpif, struct xlate_out *xout,
port = xout->slow & (SLOW_CFM | SLOW_BFD | SLOW_LACP | SLOW_STP)
? ODPP_NONE
: odp_in_port;
- pid = dpif_port_get_pid(udpif->dpif, port, flow_hash_5tuple(flow, 0));
+ pid = dpif_port_get_pid(udpif->dpif, port);
size_t offset;
size_t ac_offset;
@@ -1196,7 +1195,7 @@ upcall_xlate(struct udpif *udpif, struct upcall *upcall,
odp_actions->data, odp_actions->size);
} else {
/* upcall->put_actions already initialized by upcall_receive(). */
- compose_slow_path(udpif, &upcall->xout, upcall->flow,
+ compose_slow_path(udpif, &upcall->xout,
upcall->flow->in_port.odp_port, upcall->ofp_in_port,
&upcall->put_actions,
upcall->ofproto->up.slowpath_meter_id,
@@ -2155,7 +2154,7 @@ revalidate_ukey__(struct udpif *udpif, const struct udpif_key *ukey,
goto exit;
}
- compose_slow_path(udpif, xoutp, &ctx.flow, ctx.flow.in_port.odp_port,
+ compose_slow_path(udpif, xoutp, ctx.flow.in_port.odp_port,
ofp_in_port, odp_actions,
ofproto->up.slowpath_meter_id, &ofproto->uuid);
}
diff --git a/ofproto/ofproto-dpif-xlate.c b/ofproto/ofproto-dpif-xlate.c
index 6949595ba..f11f60468 100644
--- a/ofproto/ofproto-dpif-xlate.c
+++ b/ofproto/ofproto-dpif-xlate.c
@@ -3084,8 +3084,7 @@ compose_sample_action(struct xlate_ctx *ctx,
odp_port_t odp_port = ofp_port_to_odp_port(
ctx->xbridge, ctx->xin->flow.in_port.ofp_port);
- uint32_t pid = dpif_port_get_pid(ctx->xbridge->dpif, odp_port,
- flow_hash_5tuple(&ctx->xin->flow, 0));
+ uint32_t pid = dpif_port_get_pid(ctx->xbridge->dpif, odp_port);
size_t cookie_offset = odp_put_userspace_action(pid, cookie,
sizeof *cookie,
tunnel_out_port,
@@ -4638,8 +4637,7 @@ put_controller_user_action(struct xlate_ctx *ctx,
odp_port_t odp_port = ofp_port_to_odp_port(ctx->xbridge,
ctx->xin->flow.in_port.ofp_port);
- uint32_t pid = dpif_port_get_pid(ctx->xbridge->dpif, odp_port,
- flow_hash_5tuple(&ctx->xin->flow, 0));
+ uint32_t pid = dpif_port_get_pid(ctx->xbridge->dpif, odp_port);
odp_put_userspace_action(pid, &cookie, sizeof cookie, ODPP_NONE,
false, ctx->odp_actions);
}
--
2.17.1

@@ -1,310 +0,0 @@
From 57ce73db12f6d3e980c0b285015c998183f26c8d Mon Sep 17 00:00:00 2001
From: Kevin Traynor <ktraynor@redhat.com>
Date: Fri, 31 Aug 2018 09:47:55 +0100
Subject: [PATCH] dpif-netdev: Add round-robin based rxq to pmd assignment.

Prior to OVS 2.9 automatic assignment of Rxqs to PMDs
(i.e. CPUs) was done by round-robin.

That was changed in OVS 2.9 to ordering the Rxqs based on
their measured processing cycles. This was to assign the
busiest Rxqs to different PMDs, improving aggregate
throughput.

For the most part the new scheme should be better, but
there could be situations where a user prefers a simple
round-robin scheme because Rxqs from a single port are
more likely to be spread across multiple PMDs, and/or
traffic is very bursty/unpredictable.

Add 'pmd-rxq-assign' config to allow a user to select
round-robin based assignment.
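
For example, the assignment mode can be switched at runtime with the command
documented by this patch in Documentation/topics/dpdk/pmd.rst:

    $ ovs-vsctl set Open_vSwitch . other_config:pmd-rxq-assign=roundrobin
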
Signed-off-by: Kevin Traynor <ktraynor@redhat.com>
Acked-by: Eelco Chaudron <echaudro@redhat.com>
Acked-by: Ilya Maximets <i.maximets@samsung.com>
Signed-off-by: Ian Stokes <ian.stokes@intel.com>
---
Documentation/topics/dpdk/pmd.rst | 33 +++++++++++++---
NEWS | 4 +-
lib/dpif-netdev.c | 83 +++++++++++++++++++++++++++++----------
tests/pmd.at | 12 +++++-
vswitchd/vswitch.xml | 24 +++++++++++
5 files changed, 126 insertions(+), 30 deletions(-)
diff --git a/Documentation/topics/dpdk/pmd.rst b/Documentation/topics/dpdk/pmd.rst
index 5f0671e..dd9172d 100644
--- a/Documentation/topics/dpdk/pmd.rst
+++ b/Documentation/topics/dpdk/pmd.rst
@@ -113,10 +113,15 @@ means that this thread will only poll the *pinned* Rx queues.
If ``pmd-rxq-affinity`` is not set for Rx queues, they will be assigned to PMDs
-(cores) automatically. Where known, the processing cycles that have been stored
-for each Rx queue will be used to assign Rx queue to PMDs based on a round
-robin of the sorted Rx queues. For example, take the following example, where
-there are five Rx queues and three cores - 3, 7, and 8 - available and the
-measured usage of core cycles per Rx queue over the last interval is seen to
-be:
+(cores) automatically.
+
+The algorithm used to automatically assign Rxqs to PMDs can be set by::
+
+ $ ovs-vsctl set Open_vSwitch . other_config:pmd-rxq-assign=<assignment>
+
+By default, ``cycles`` assignment is used where the Rxqs will be ordered by
+their measured processing cycles, and then be evenly assigned in descending
+order to PMDs based on an up/down walk of the PMDs. For example, where there
+are five Rx queues and three cores - 3, 7, and 8 - available and the measured
+usage of core cycles per Rx queue over the last interval is seen to be:
- Queue #0: 30%
@@ -132,4 +137,20 @@ The Rx queues will be assigned to the cores in the following order::
Core 8: Q3 (60%) | Q0 (30%)
+Alternatively, ``roundrobin`` assignment can be used, where the Rxqs are
+assigned to PMDs in a round-robined fashion. This algorithm was used by
+default prior to OVS 2.9. For example, given the following ports and queues:
+
+- Port #0 Queue #0 (P0Q0)
+- Port #0 Queue #1 (P0Q1)
+- Port #1 Queue #0 (P1Q0)
+- Port #1 Queue #1 (P1Q1)
+- Port #1 Queue #2 (P1Q2)
+
+The Rx queues may be assigned to the cores in the following order::
+
+ Core 3: P0Q0 | P1Q1
+ Core 7: P0Q1 | P1Q2
+ Core 8: P1Q0 |
+
To see the current measured usage history of PMD core cycles for each Rx
queue::
diff --git a/NEWS b/NEWS
index 04de807..87da271 100644
--- a/NEWS
+++ b/NEWS
@@ -43,6 +43,8 @@
* Allow init to fail and record DPDK status/version in OVS database.
* Add experimental flow hardware offload support
* Support both shared and per port mempools for DPDK devices.
+ * Add option for simple round-robin based Rxq to PMD assignment.
+ It can be set with pmd-rxq-assign.
- Userspace datapath:
* Commands ovs-appctl dpif-netdev/pmd-*-show can now work on a single PMD
* Detailed PMD performance metrics available with new command
diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c
index 52b5bc2..466d5ac 100644
--- a/lib/dpif-netdev.c
+++ b/lib/dpif-netdev.c
@@ -342,4 +342,6 @@ struct dp_netdev {
struct id_pool *tx_qid_pool;
struct ovs_mutex tx_qid_pool_mutex;
+ /* Use measured cycles for rxq to pmd assignment. */
+ bool pmd_rxq_assign_cyc;
/* Protects the access of the 'struct dp_netdev_pmd_thread'
@@ -1493,4 +1495,5 @@ create_dp_netdev(const char *name, const struct dpif_class *class,
cmap_init(&dp->poll_threads);
+ dp->pmd_rxq_assign_cyc = true;
ovs_mutex_init(&dp->tx_qid_pool_mutex);
@@ -3717,4 +3720,6 @@ dpif_netdev_set_config(struct dpif *dpif, const struct smap *other_config)
struct dp_netdev *dp = get_dp_netdev(dpif);
const char *cmask = smap_get(other_config, "pmd-cpu-mask");
+ const char *pmd_rxq_assign = smap_get_def(other_config, "pmd-rxq-assign",
+ "cycles");
unsigned long long insert_prob =
smap_get_ullong(other_config, "emc-insert-inv-prob",
@@ -3779,4 +3784,18 @@ dpif_netdev_set_config(struct dpif *dpif, const struct smap *other_config)
}
}
+
+ bool pmd_rxq_assign_cyc = !strcmp(pmd_rxq_assign, "cycles");
+ if (!pmd_rxq_assign_cyc && strcmp(pmd_rxq_assign, "roundrobin")) {
+ VLOG_WARN("Unsupported Rxq to PMD assignment mode in pmd-rxq-assign. "
+ "Defaulting to 'cycles'.");
+ pmd_rxq_assign_cyc = true;
+ pmd_rxq_assign = "cycles";
+ }
+ if (dp->pmd_rxq_assign_cyc != pmd_rxq_assign_cyc) {
+ dp->pmd_rxq_assign_cyc = pmd_rxq_assign_cyc;
+ VLOG_INFO("Rxq to PMD assignment mode changed to: \'%s\'.",
+ pmd_rxq_assign);
+ dp_netdev_request_reconfigure(dp);
+ }
return 0;
}
@@ -4249,8 +4268,16 @@ rr_numa_list_populate(struct dp_netdev *dp, struct rr_numa_list *rr)
}
-/* Returns the next pmd from the numa node in
- * incrementing or decrementing order. */
+/*
+ * Returns the next pmd from the numa node.
+ *
+ * If 'updown' is 'true' it will alternate between selecting the next pmd in
+ * either an up or down walk, switching between up/down when the first or last
+ * core is reached. e.g. 1,2,3,3,2,1,1,2...
+ *
+ * If 'updown' is 'false' it will select the next pmd wrapping around when last
+ * core reached. e.g. 1,2,3,1,2,3,1,2...
+ */
static struct dp_netdev_pmd_thread *
-rr_numa_get_pmd(struct rr_numa *numa)
+rr_numa_get_pmd(struct rr_numa *numa, bool updown)
{
int numa_idx = numa->cur_index;
@@ -4260,5 +4287,9 @@ rr_numa_get_pmd(struct rr_numa *numa)
if (numa->cur_index == numa->n_pmds-1) {
/* Reached the last pmd. */
- numa->idx_inc = false;
+ if (updown) {
+ numa->idx_inc = false;
+ } else {
+ numa->cur_index = 0;
+ }
} else {
numa->cur_index++;
@@ -4323,7 +4354,4 @@ compare_rxq_cycles(const void *a, const void *b)
* pmds to unpinned queues.
*
- * If 'pinned' is false queues will be sorted by processing cycles they are
- * consuming and then assigned to pmds in round robin order.
- *
* The function doesn't touch the pmd threads, it just stores the assignment
* in the 'pmd' member of each rxq. */
@@ -4338,4 +4366,5 @@ rxq_scheduling(struct dp_netdev *dp, bool pinned) OVS_REQUIRES(dp->port_mutex)
struct rr_numa *numa = NULL;
int numa_id;
+ bool assign_cyc = dp->pmd_rxq_assign_cyc;
HMAP_FOR_EACH (port, node, &dp->ports) {
@@ -4368,10 +4397,13 @@ rxq_scheduling(struct dp_netdev *dp, bool pinned) OVS_REQUIRES(dp->port_mutex)
rxqs = xrealloc(rxqs, sizeof *rxqs * (n_rxqs + 1));
}
- /* Sum the queue intervals and store the cycle history. */
- for (unsigned i = 0; i < PMD_RXQ_INTERVAL_MAX; i++) {
- cycle_hist += dp_netdev_rxq_get_intrvl_cycles(q, i);
- }
- dp_netdev_rxq_set_cycles(q, RXQ_CYCLES_PROC_HIST, cycle_hist);
+ if (assign_cyc) {
+ /* Sum the queue intervals and store the cycle history. */
+ for (unsigned i = 0; i < PMD_RXQ_INTERVAL_MAX; i++) {
+ cycle_hist += dp_netdev_rxq_get_intrvl_cycles(q, i);
+ }
+ dp_netdev_rxq_set_cycles(q, RXQ_CYCLES_PROC_HIST,
+ cycle_hist);
+ }
/* Store the queue. */
rxqs[n_rxqs++] = q;
@@ -4380,5 +4412,5 @@ rxq_scheduling(struct dp_netdev *dp, bool pinned) OVS_REQUIRES(dp->port_mutex)
}
- if (n_rxqs > 1) {
+ if (n_rxqs > 1 && assign_cyc) {
/* Sort the queues in order of the processing cycles
* they consumed during their last pmd interval. */
@@ -4404,5 +4436,5 @@ rxq_scheduling(struct dp_netdev *dp, bool pinned) OVS_REQUIRES(dp->port_mutex)
continue;
}
- rxqs[i]->pmd = rr_numa_get_pmd(non_local_numa);
+ rxqs[i]->pmd = rr_numa_get_pmd(non_local_numa, assign_cyc);
VLOG_WARN("There's no available (non-isolated) pmd thread "
"on numa node %d. Queue %d on port \'%s\' will "
@@ -4413,11 +4445,20 @@ rxq_scheduling(struct dp_netdev *dp, bool pinned) OVS_REQUIRES(dp->port_mutex)
rxqs[i]->pmd->core_id, rxqs[i]->pmd->numa_id);
} else {
- rxqs[i]->pmd = rr_numa_get_pmd(numa);
- VLOG_INFO("Core %d on numa node %d assigned port \'%s\' "
- "rx queue %d (measured processing cycles %"PRIu64").",
- rxqs[i]->pmd->core_id, numa_id,
- netdev_rxq_get_name(rxqs[i]->rx),
- netdev_rxq_get_queue_id(rxqs[i]->rx),
- dp_netdev_rxq_get_cycles(rxqs[i], RXQ_CYCLES_PROC_HIST));
+ rxqs[i]->pmd = rr_numa_get_pmd(numa, assign_cyc);
+ if (assign_cyc) {
+ VLOG_INFO("Core %d on numa node %d assigned port \'%s\' "
+ "rx queue %d "
+ "(measured processing cycles %"PRIu64").",
+ rxqs[i]->pmd->core_id, numa_id,
+ netdev_rxq_get_name(rxqs[i]->rx),
+ netdev_rxq_get_queue_id(rxqs[i]->rx),
+ dp_netdev_rxq_get_cycles(rxqs[i],
+ RXQ_CYCLES_PROC_HIST));
+ } else {
+ VLOG_INFO("Core %d on numa node %d assigned port \'%s\' "
+ "rx queue %d.", rxqs[i]->pmd->core_id, numa_id,
+ netdev_rxq_get_name(rxqs[i]->rx),
+ netdev_rxq_get_queue_id(rxqs[i]->rx));
+ }
}
}
diff --git a/tests/pmd.at b/tests/pmd.at
index 4cae6c8..1f952f3 100644
--- a/tests/pmd.at
+++ b/tests/pmd.at
@@ -62,5 +62,6 @@ m4_define([CHECK_PMD_THREADS_CREATED], [
m4_define([SED_NUMA_CORE_PATTERN], ["s/\(numa_id \)[[0-9]]*\( core_id \)[[0-9]]*:/\1<cleared>\2<cleared>:/"])
-m4_define([SED_NUMA_CORE_QUEUE_PATTERN], ["s/1 2 5 6/<group>/;s/0 3 4 7/<group>/"])
+m4_define([SED_NUMA_CORE_QUEUE_CYC_PATTERN], ["s/1 2 5 6/<group>/;s/0 3 4 7/<group>/"])
+m4_define([SED_NUMA_CORE_QUEUE_PQ_PATTERN], ["s/1 3 5 7/<group>/;s/0 2 4 6/<group>/"])
m4_define([DUMMY_NUMA], [--dummy-numa="0,0,0,0"])
@@ -146,9 +147,16 @@ pmd thread numa_id <cleared> core_id <cleared>:
])
+AT_CHECK([ovs-vsctl set Open_vSwitch . other_config:pmd-rxq-assign=cycles])
TMP=$(cat ovs-vswitchd.log | wc -l | tr -d [[:blank:]])
AT_CHECK([ovs-vsctl set Open_vSwitch . other_config:pmd-cpu-mask=0x3])
CHECK_PMD_THREADS_CREATED([2], [], [+$TMP])
-AT_CHECK([ovs-appctl dpif-netdev/pmd-rxq-show | sed ':a;/AVAIL$/{N;s/\n//;ba;}' | parse_pmd_rxq_show_group | sed SED_NUMA_CORE_QUEUE_PATTERN], [0], [dnl
+AT_CHECK([ovs-appctl dpif-netdev/pmd-rxq-show | sed ':a;/AVAIL$/{N;s/\n//;ba;}' | parse_pmd_rxq_show_group | sed SED_NUMA_CORE_QUEUE_CYC_PATTERN], [0], [dnl
+port: p0 queue-id: <group>
+port: p0 queue-id: <group>
+])
+
+AT_CHECK([ovs-vsctl set Open_vSwitch . other_config:pmd-rxq-assign=roundrobin])
+AT_CHECK([ovs-appctl dpif-netdev/pmd-rxq-show | sed ':a;/AVAIL$/{N;s/\n//;ba;}' | parse_pmd_rxq_show_group | sed SED_NUMA_CORE_QUEUE_PQ_PATTERN], [0], [dnl
port: p0 queue-id: <group>
port: p0 queue-id: <group>
diff --git a/vswitchd/vswitch.xml b/vswitchd/vswitch.xml
index e318151..91d132d 100644
--- a/vswitchd/vswitch.xml
+++ b/vswitchd/vswitch.xml
@@ -433,4 +433,28 @@
</column>
+ <column name="other_config" key="pmd-rxq-assign"
+ type='{"type": "string",
+ "enum": ["set", ["cycles", "roundrobin"]]}'>
+ <p>
+ Specifies how RX queues will be automatically assigned to CPU cores.
+ Options:
+ <dl>
+ <dt><code>cycles</code></dt>
+ <dd>Rxqs will be sorted by order of measured processing cycles
+ before being assigned to CPU cores.</dd>
+ <dt><code>roundrobin</code></dt>
+ <dd>Rxqs will be round-robined across CPU cores.</dd>
+ </dl>
+ </p>
+ <p>
+ The default value is <code>cycles</code>.
+ </p>
+ <p>
+ Changing this value will affect an automatic re-assignment of Rxqs to
+ CPUs. Note: Rxqs mapped to CPU cores with
+ <code>pmd-rxq-affinity</code> are unaffected.
+ </p>
+ </column>
+
<column name="other_config" key="n-handler-threads"
type='{"type": "integer", "minInteger": 1}'>
--
1.8.3.1

@@ -1,357 +0,0 @@
From 9b4f08cdcaf253175edda088683bdd3db9e4c097 Mon Sep 17 00:00:00 2001
From: Vishal Deep Ajmera <vishal.deep.ajmera@ericsson.com>
Date: Fri, 27 Jul 2018 23:56:37 +0530
Subject: [PATCH] dpif-netdev: Avoid reordering of packets in a batch with same
 megaflow

OVS reads packets in batches from a given port and packets in the
batch are subjected to potentially 3 levels of lookups to identify
the datapath megaflow entry (or flow) associated with the packet.
Each megaflow entry has a dedicated buffer in which packets that match
the flow classification criteria are collected. This buffer helps OVS
perform batch processing for all packets associated with a given flow.

Each packet in the received batch is first subjected to lookup in the
Exact Match Cache (EMC). Each EMC entry will point to a flow. If the
EMC lookup is successful, the packet is moved from the rx batch to the
per-flow buffer.

Packets that did not match any EMC entry are rearranged in the rx batch
at the beginning and are now subjected to a lookup in the megaflow cache.
Packets that match a megaflow cache entry are *appended* to the per-flow
buffer.

Packets that do not match any megaflow entry are subjected to slow-path
processing through the upcall mechanism. This cannot change the order of
packets as, by definition, upcall processing is only done for packets
without a matching megaflow entry.

The EMC entry match fields encompass all potentially significant header
fields, typically more than specified in the associated flow's match
criteria. Hence, multiple EMC entries can point to the same flow. Given
that per-flow batching happens at each lookup stage, packets belonging
to the same megaflow can get re-ordered because some packets match EMC
entries while others do not.

The following example illustrates the issue. Consider the following
batch of packets (labelled P1 to P8) associated with a single TCP
connection and therefore with a single flow. Let us assume that packets
with just the ACK bit set in TCP flags have also been received in a
prior batch and that a corresponding EMC entry exists.

1. P1 (TCP Flag: ACK)
2. P2 (TCP Flag: ACK)
3. P3 (TCP Flag: ACK)
4. P4 (TCP Flag: ACK, PSH)
5. P5 (TCP Flag: ACK)
6. P6 (TCP Flag: ACK)
7. P7 (TCP Flag: ACK)
8. P8 (TCP Flag: ACK)

The megaflow classification criteria does not include TCP flags while
the EMC match criteria does. Thus, all packets other than P4 match
the existing EMC entry and are moved to the per-flow packet batch.
Subsequently, packet P4 is moved to the same per-flow packet batch as
a result of the megaflow lookup. Though the packets have all been
correctly classified as being associated with the same flow, the
packet order has not been preserved because of the per-flow batching
performed during the EMC lookup stage. This packet re-ordering has
performance implications for TCP applications.

This patch preserves the packet ordering by performing the per-flow
batching after both the EMC and megaflow lookups are complete. As an
optimization, packets are flow-batched during EMC processing until any
packet in the batch has an EMC miss.

A new flow map is maintained to keep the original order of the packets
along with their flow information. After fast-path processing, packets
from the flow map are *appended* to the per-flow buffer.
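
In outline, the new data structure and the deferred batching step look like
this (condensed from the hunks below; declarations, counters, and the EMC/SMC
fast paths are omitted):

    /* One entry per received packet, kept in receive order. */
    struct dp_packet_flow_map {
        struct dp_packet *packet;
        struct dp_netdev_flow *flow;
        uint16_t tcp_flags;
    };

    /* After the EMC/SMC and megaflow lookups have all completed, batch
     * the packets per flow in their original order. */
    for (i = 0; i < n_flows; i++) {
        struct dp_packet_flow_map *map = &flow_map[i];

        if (OVS_UNLIKELY(!map->flow)) {
            continue;    /* Packet had no matching flow (e.g. upcalled). */
        }
        dp_netdev_queue_batches(map->packet, map->flow, map->tcp_flags,
                                batches, &n_batches);
    }
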
Signed-off-by: Vishal Deep Ajmera <vishal.deep.ajmera@ericsson.com>
Co-authored-by: Venkatesan Pradeep <venkatesan.pradeep@ericsson.com>
Signed-off-by: Venkatesan Pradeep <venkatesan.pradeep@ericsson.com>
Signed-off-by: Ian Stokes <ian.stokes@intel.com>
---
lib/dpif-netdev.c | 125 +++++++++++++++++++++++++++++++++++++++-------
1 file changed, 106 insertions(+), 19 deletions(-)
diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c
index 7f836bb18..807a46250 100644
--- a/lib/dpif-netdev.c
+++ b/lib/dpif-netdev.c
@@ -244,6 +244,13 @@ struct dpcls_rule {
/* 'flow' must be the last field, additional space is allocated here. */
};
+/* Data structure to keep packet order till fastpath processing. */
+struct dp_packet_flow_map {
+ struct dp_packet *packet;
+ struct dp_netdev_flow *flow;
+ uint16_t tcp_flags;
+};
+
static void dpcls_init(struct dpcls *);
static void dpcls_destroy(struct dpcls *);
static void dpcls_sort_subtable_vector(struct dpcls *);
@@ -5765,6 +5772,19 @@ dp_netdev_queue_batches(struct dp_packet *pkt,
packet_batch_per_flow_update(batch, pkt, tcp_flags);
}
+static inline void
+packet_enqueue_to_flow_map(struct dp_packet *packet,
+ struct dp_netdev_flow *flow,
+ uint16_t tcp_flags,
+ struct dp_packet_flow_map *flow_map,
+ size_t index)
+{
+ struct dp_packet_flow_map *map = &flow_map[index];
+ map->flow = flow;
+ map->packet = packet;
+ map->tcp_flags = tcp_flags;
+}
+
/* SMC lookup function for a batch of packets.
* By doing batching SMC lookup, we can use prefetch
* to hide memory access latency.
@@ -5774,8 +5794,9 @@ smc_lookup_batch(struct dp_netdev_pmd_thread *pmd,
struct netdev_flow_key *keys,
struct netdev_flow_key **missed_keys,
struct dp_packet_batch *packets_,
- struct packet_batch_per_flow batches[],
- size_t *n_batches, const int cnt)
+ const int cnt,
+ struct dp_packet_flow_map *flow_map,
+ uint8_t *index_map)
{
int i;
struct dp_packet *packet;
@@ -5783,6 +5804,8 @@ smc_lookup_batch(struct dp_netdev_pmd_thread *pmd,
struct dfc_cache *cache = &pmd->flow_cache;
struct smc_cache *smc_cache = &cache->smc_cache;
const struct cmap_node *flow_node;
+ int recv_idx;
+ uint16_t tcp_flags;
/* Prefetch buckets for all packets */
for (i = 0; i < cnt; i++) {
@@ -5793,6 +5816,8 @@ smc_lookup_batch(struct dp_netdev_pmd_thread *pmd,
struct dp_netdev_flow *flow = NULL;
flow_node = smc_entry_get(pmd, keys[i].hash);
bool hit = false;
+ /* Get the original order of this packet in received batch. */
+ recv_idx = index_map[i];
if (OVS_LIKELY(flow_node != NULL)) {
CMAP_NODE_FOR_EACH (flow, node, flow_node) {
@@ -5800,12 +5825,17 @@ smc_lookup_batch(struct dp_netdev_pmd_thread *pmd,
* number, we need to verify that the input ports match. */
if (OVS_LIKELY(dpcls_rule_matches_key(&flow->cr, &keys[i]) &&
flow->flow.in_port.odp_port == packet->md.in_port.odp_port)) {
+ tcp_flags = miniflow_get_tcp_flags(&keys[i].mf);
+
/* SMC hit and emc miss, we insert into EMC */
keys[i].len =
netdev_flow_key_size(miniflow_n_values(&keys[i].mf));
emc_probabilistic_insert(pmd, &keys[i], flow);
- dp_netdev_queue_batches(packet, flow,
- miniflow_get_tcp_flags(&keys[i].mf), batches, n_batches);
+ /* Add these packets into the flow map in the same order
+ * as received.
+ */
+ packet_enqueue_to_flow_map(packet, flow, tcp_flags,
+ flow_map, recv_idx);
n_smc_hit++;
hit = true;
break;
@@ -5819,6 +5849,10 @@ smc_lookup_batch(struct dp_netdev_pmd_thread *pmd,
/* SMC missed. Group missed packets together at
* the beginning of the 'packets' array. */
dp_packet_batch_refill(packets_, packet, i);
+
+ /* Preserve the order of packet for flow batching. */
+ index_map[n_missed] = recv_idx;
+
/* Put missed keys to the pointer arrays return to the caller */
missed_keys[n_missed++] = &keys[i];
}
@@ -5847,6 +5881,8 @@ dfc_processing(struct dp_netdev_pmd_thread *pmd,
struct netdev_flow_key *keys,
struct netdev_flow_key **missed_keys,
struct packet_batch_per_flow batches[], size_t *n_batches,
+ struct dp_packet_flow_map *flow_map,
+ size_t *n_flows, uint8_t *index_map,
bool md_is_valid, odp_port_t port_no)
{
struct netdev_flow_key *key = &keys[0];
@@ -5858,6 +5894,8 @@ dfc_processing(struct dp_netdev_pmd_thread *pmd,
int i;
uint16_t tcp_flags;
bool smc_enable_db;
+ size_t map_cnt = 0;
+ bool batch_enable = true;
atomic_read_relaxed(&pmd->dp->smc_enable_db, &smc_enable_db);
atomic_read_relaxed(&pmd->dp->emc_insert_min, &cur_min);
@@ -5888,10 +5926,19 @@ dfc_processing(struct dp_netdev_pmd_thread *pmd,
if ((*recirc_depth_get() == 0) &&
dp_packet_has_flow_mark(packet, &mark)) {
flow = mark_to_flow_find(pmd, mark);
- if (flow) {
+ if (OVS_LIKELY(flow)) {
tcp_flags = parse_tcp_flags(packet);
- dp_netdev_queue_batches(packet, flow, tcp_flags, batches,
- n_batches);
+ if (OVS_LIKELY(batch_enable)) {
+ dp_netdev_queue_batches(packet, flow, tcp_flags, batches,
+ n_batches);
+ } else {
+ /* Flow batching should be performed only after fast-path
+ * processing is also completed for packets with emc miss
+ * or else it will result in reordering of packets with
+ * same datapath flows. */
+ packet_enqueue_to_flow_map(packet, flow, tcp_flags,
+ flow_map, map_cnt++);
+ }
continue;
}
}
@@ -5914,13 +5961,27 @@ dfc_processing(struct dp_netdev_pmd_thread *pmd,
}
if (OVS_LIKELY(flow)) {
tcp_flags = miniflow_get_tcp_flags(&key->mf);
- dp_netdev_queue_batches(packet, flow, tcp_flags, batches,
- n_batches);
n_emc_hit++;
+ if (OVS_LIKELY(batch_enable)) {
+ dp_netdev_queue_batches(packet, flow, tcp_flags, batches,
+ n_batches);
+ } else {
+ /* Flow batching should be performed only after fast-path
+ * processing is also completed for packets with emc miss
+ * or else it will result in reordering of packets with
+ * same datapath flows. */
+ packet_enqueue_to_flow_map(packet, flow, tcp_flags,
+ flow_map, map_cnt++);
+ }
} else {
/* Exact match cache missed. Group missed packets together at
* the beginning of the 'packets' array. */
dp_packet_batch_refill(packets_, packet, i);
+
+ /* Preserve the order of packet for flow batching. */
+ index_map[n_missed] = map_cnt;
+ flow_map[map_cnt++].flow = NULL;
+
/* 'key[n_missed]' contains the key of the current packet and it
* will be passed to SMC lookup. The next key should be extracted
* to 'keys[n_missed + 1]'.
@@ -5928,8 +5989,13 @@ dfc_processing(struct dp_netdev_pmd_thread *pmd,
* which will be returned to the caller for future processing. */
missed_keys[n_missed] = key;
key = &keys[++n_missed];
+
+ /* Skip batching for subsequent packets to avoid reordering. */
+ batch_enable = false;
}
}
+ /* Count of packets which are not flow batched. */
+ *n_flows = map_cnt;
pmd_perf_update_counter(&pmd->perf_stats, PMD_STAT_EXACT_HIT, n_emc_hit);
@@ -5938,8 +6004,8 @@ dfc_processing(struct dp_netdev_pmd_thread *pmd,
}
/* Packets miss EMC will do a batch lookup in SMC if enabled */
- smc_lookup_batch(pmd, keys, missed_keys, packets_, batches,
- n_batches, n_missed);
+ smc_lookup_batch(pmd, keys, missed_keys, packets_,
+ n_missed, flow_map, index_map);
return dp_packet_batch_size(packets_);
}
@@ -6026,8 +6092,8 @@ static inline void
fast_path_processing(struct dp_netdev_pmd_thread *pmd,
struct dp_packet_batch *packets_,
struct netdev_flow_key **keys,
- struct packet_batch_per_flow batches[],
- size_t *n_batches,
+ struct dp_packet_flow_map *flow_map,
+ uint8_t *index_map,
odp_port_t in_port)
{
const size_t cnt = dp_packet_batch_size(packets_);
@@ -6107,6 +6173,9 @@ fast_path_processing(struct dp_netdev_pmd_thread *pmd,
DP_PACKET_BATCH_FOR_EACH (i, packet, packets_) {
struct dp_netdev_flow *flow;
+ /* Get the original order of this packet in received batch. */
+ int recv_idx = index_map[i];
+ uint16_t tcp_flags;
if (OVS_UNLIKELY(!rules[i])) {
continue;
@@ -6117,9 +6186,12 @@ fast_path_processing(struct dp_netdev_pmd_thread *pmd,
smc_insert(pmd, keys[i], hash);
emc_probabilistic_insert(pmd, keys[i], flow);
- dp_netdev_queue_batches(packet, flow,
- miniflow_get_tcp_flags(&keys[i]->mf),
- batches, n_batches);
+ /* Add these packets into the flow map in the same order
+ * as received.
+ */
+ tcp_flags = miniflow_get_tcp_flags(&keys[i]->mf);
+ packet_enqueue_to_flow_map(packet, flow, tcp_flags,
+ flow_map, recv_idx);
}
pmd_perf_update_counter(&pmd->perf_stats, PMD_STAT_MASKED_HIT,
@@ -6152,18 +6224,34 @@ dp_netdev_input__(struct dp_netdev_pmd_thread *pmd,
struct netdev_flow_key *missed_keys[PKT_ARRAY_SIZE];
struct packet_batch_per_flow batches[PKT_ARRAY_SIZE];
size_t n_batches;
+ struct dp_packet_flow_map flow_map[PKT_ARRAY_SIZE];
+ uint8_t index_map[PKT_ARRAY_SIZE];
+ size_t n_flows, i;
+
odp_port_t in_port;
n_batches = 0;
dfc_processing(pmd, packets, keys, missed_keys, batches, &n_batches,
- md_is_valid, port_no);
+ flow_map, &n_flows, index_map, md_is_valid, port_no);
+
if (!dp_packet_batch_is_empty(packets)) {
/* Get ingress port from first packet's metadata. */
in_port = packets->packets[0]->md.in_port.odp_port;
fast_path_processing(pmd, packets, missed_keys,
- batches, &n_batches, in_port);
+ flow_map, index_map, in_port);
}
+ /* Batch rest of packets which are in flow map. */
+ for (i = 0; i < n_flows; i++) {
+ struct dp_packet_flow_map *map = &flow_map[i];
+
+ if (OVS_UNLIKELY(!map->flow)) {
+ continue;
+ }
+ dp_netdev_queue_batches(map->packet, map->flow, map->tcp_flags,
+ batches, &n_batches);
+ }
+
/* All the flow batches need to be reset before any call to
* packet_batch_per_flow_execute() as it could potentially trigger
* recirculation. When a packet matching flow j happens to be
@@ -6173,7 +6261,6 @@ dp_netdev_input__(struct dp_netdev_pmd_thread *pmd,
* already its own batches[k] still waiting to be served. So if its
* batch member is not reset, the recirculated packet would be wrongly
* appended to batches[k] of the 1st call to dp_netdev_input__(). */
- size_t i;
for (i = 0; i < n_batches; i++) {
batches[i].flow->batch = NULL;
}
--
2.17.1

@@ -1,669 +0,0 @@
From 4c91bc3bf8c6005db5795fe51632c1feedc4719e Mon Sep 17 00:00:00 2001
From: Matteo Croce <mcroce@redhat.com>
Date: Tue, 18 Sep 2018 14:56:37 +0200
Subject: [PATCH v2] dpif-netlink: don't allocate per thread netlink sockets

When using the kernel datapath, OVS allocates a pool of sockets to handle
netlink events. The number of sockets is: ports * n-handler-threads, where
n-handler-threads is user configurable and defaults to 3/4 * number of cores.

This is because vswitchd starts n-handler-threads threads, each one with a
netlink socket for every port of the switch. Every thread then starts
listening for events on its set of sockets with epoll().

On setups with lots of CPUs and ports, the number of sockets easily hits
the process file descriptor limit, and ovs-vswitchd will exit with -EMFILE.

Change the number of allocated sockets to just one per port by moving
the socket array from a per-handler structure to a per-datapath one,
and let all the handlers share the same sockets by using the EPOLLEXCLUSIVE
epoll flag, which avoids duplicate events, on systems that support it.
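
In outline (condensed from the hunks below; declarations and error handling
omitted), every handler thread now registers the same per-port Netlink socket
fd in its own epoll set with EPOLLEXCLUSIVE, so the kernel wakes at most one
of the waiting threads per upcall:

    memset(&event, 0, sizeof event);
    event.events = EPOLLIN | EPOLLEXCLUSIVE;
    event.data.u32 = port_idx;
    for (i = 0; i < dpif->n_handlers; i++) {
        struct dpif_handler *handler = &dpif->handlers[i];

        epoll_ctl(handler->epoll_fd, EPOLL_CTL_ADD, nl_sock_fd(socksp),
                  &event);
    }
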
The patch was tested on a 56 core machine running Linux 4.18 and the latest
Open vSwitch. A bridge was created with 2000+ ports, some of them being
veth interfaces with the peer outside the bridge. The latency of the upcall
is measured by setting a single 'action=controller,local' OpenFlow rule to
force all the packets to go to the slow path and then to the local port.
A tool[1] injects some packets to the veth outside the bridge, and measures
the delay until the packet is captured on the local port. The rx timestamp
is taken from the socket ancillary data in the attribute SO_TIMESTAMPNS, to
avoid having the scheduler delay in the measured time.

The first test measures the average latency for an upcall generated from
a single port. To measure it, 100k packets, one every msec, are sent to a
single port and the latencies are measured.

The second test is meant to check latency fairness among ports, namely if
latency is equal between ports or if some ports have lower priority.
The previous test is repeated for every port, and the average of the average
latencies and the standard deviation between averages are measured.

The third test serves to measure responsiveness under load. Heavy traffic
is sent through all ports, and latency and packet loss are measured
on a single idle port.

The fourth test is all about fairness. Heavy traffic is injected into all
ports but one, and latency and packet loss are measured on the single idle
port.

This is the test setup:
# nproc
56
# ovs-vsctl show |grep -c Port
2223
# ovs-ofctl dump-flows ovs_upc_br
cookie=0x0, duration=4.827s, table=0, n_packets=0, n_bytes=0, actions=CONTROLLER:65535,LOCAL
# uname -a
Linux fc28 4.18.7-200.fc28.x86_64 #1 SMP Mon Sep 10 15:44:45 UTC 2018 x86_64 x86_64 x86_64 GNU/Linux
And these are the results of the tests:

                                        Stock OVS           | Patched

  netlink sockets in use by vswitchd
  (lsof -p $(pidof ovs-vswitchd) | grep -c GENERIC)
                                        91187               | 2227

  Test 1: one port latency
  min/avg/max/mdev (us)                 2.7/6.6/238.7/1.8   | 1.6/6.8/160.6/1.7

  Test 2: all port
  avg latency/mdev (us)                 6.51/0.97           | 6.86/0.17

  Test 3: single port latency under load
  avg/mdev (us)                         7.5/5.9             | 3.8/4.8
  packet loss                           95 %                | 62 %

  Test 4: idle port latency under load
  min/avg/max/mdev (us)                 0.8/1.5/210.5/0.9   | 1.0/2.1/344.5/1.2
  packet loss                           94 %                | 4 %

CPU and RAM usage seems not to be affected; the resource usage of vswitchd
idle with 2000+ ports is unchanged:

# ps u $(pidof ovs-vswitchd)
USER       PID %CPU %MEM     VSZ    RSS TTY   STAT START  TIME COMMAND
openvsw+  5430 54.3  0.3 4263964 510968 pts/1 RLl+ 16:20  0:50 ovs-vswitchd

Additionally, to check if vswitchd is thread safe with this patch, the
following test was run for circa 48 hours: on a 56 core machine, a
bridge with kernel datapath is filled with 2200 dummy interfaces and 22
veths, then 22 traffic generators are run in parallel, piping traffic into
the veth peers outside the bridge.

To generate as many upcalls as possible, all packets were forced to the
slowpath with an OpenFlow rule like 'action=controller,local' and packet
size was set to 64 bytes. Also, to avoid overflowing the FDB early and
slowing down the upcall processing, generated MAC addresses were restricted
to a small interval. vswitchd ran without problems for 48+ hours,
obviously with all the handler threads at almost 99% CPU usage.

[1] https://github.com/teknoraver/network-tools/blob/master/weed.c

Signed-off-by: Matteo Croce <mcroce@redhat.com>
---
v1 -> v2:
- define EPOLLEXCLUSIVE on systems with older kernel headers
- explain the thread safety test in the commit message
lib/dpif-netlink.c | 311 ++++++++++++---------------------------------
1 file changed, 82 insertions(+), 229 deletions(-)
diff --git a/lib/dpif-netlink.c b/lib/dpif-netlink.c
index e6d5a6ec5..bb565ffee 100644
--- a/lib/dpif-netlink.c
+++ b/lib/dpif-netlink.c
@@ -78,6 +78,10 @@ enum { MAX_PORTS = USHRT_MAX };
#define FLOW_DUMP_MAX_BATCH 50
#define OPERATE_MAX_OPS 50
+#ifndef EPOLLEXCLUSIVE
+#define EPOLLEXCLUSIVE (1u << 28)
+#endif
+
struct dpif_netlink_dp {
/* Generic Netlink header. */
uint8_t cmd;
@@ -170,7 +174,6 @@ struct dpif_windows_vport_sock {
#endif
struct dpif_handler {
- struct dpif_channel *channels;/* Array of channels for each handler. */
struct epoll_event *epoll_events;
int epoll_fd; /* epoll fd that includes channel socks. */
int n_events; /* Num events returned by epoll_wait(). */
@@ -193,6 +196,7 @@ struct dpif_netlink {
struct fat_rwlock upcall_lock;
struct dpif_handler *handlers;
uint32_t n_handlers; /* Num of upcall handlers. */
+ struct dpif_channel *channels; /* Array of channels for each port. */
int uc_array_size; /* Size of 'handler->channels' and */
/* 'handler->epoll_events'. */
@@ -331,43 +335,6 @@ open_dpif(const struct dpif_netlink_dp *dp, struct dpif **dpifp)
return 0;
}
-/* Destroys the netlink sockets pointed by the elements in 'socksp'
- * and frees the 'socksp'. */
-static void
-vport_del_socksp__(struct nl_sock **socksp, uint32_t n_socks)
-{
- size_t i;
-
- for (i = 0; i < n_socks; i++) {
- nl_sock_destroy(socksp[i]);
- }
-
- free(socksp);
-}
-
-/* Creates an array of netlink sockets. Returns an array of the
- * corresponding pointers. Records the error in 'error'. */
-static struct nl_sock **
-vport_create_socksp__(uint32_t n_socks, int *error)
-{
- struct nl_sock **socksp = xzalloc(n_socks * sizeof *socksp);
- size_t i;
-
- for (i = 0; i < n_socks; i++) {
- *error = nl_sock_create(NETLINK_GENERIC, &socksp[i]);
- if (*error) {
- goto error;
- }
- }
-
- return socksp;
-
-error:
- vport_del_socksp__(socksp, n_socks);
-
- return NULL;
-}
-
#ifdef _WIN32
static void
vport_delete_sock_pool(struct dpif_handler *handler)
@@ -422,129 +389,34 @@ error:
vport_delete_sock_pool(handler);
return error;
}
-
-/* Returns an array pointers to netlink sockets. The sockets are picked from a
- * pool. Records the error in 'error'. */
-static struct nl_sock **
-vport_create_socksp_windows(struct dpif_netlink *dpif, int *error)
- OVS_REQ_WRLOCK(dpif->upcall_lock)
-{
- uint32_t n_socks = dpif->n_handlers;
- struct nl_sock **socksp;
- size_t i;
-
- ovs_assert(n_socks <= 1);
- socksp = xzalloc(n_socks * sizeof *socksp);
-
- /* Pick netlink sockets to use in a round-robin fashion from each
- * handler's pool of sockets. */
- for (i = 0; i < n_socks; i++) {
- struct dpif_handler *handler = &dpif->handlers[i];
- struct dpif_windows_vport_sock *sock_pool = handler->vport_sock_pool;
- size_t index = handler->last_used_pool_idx;
-
- /* A pool of sockets is allocated when the handler is initialized. */
- if (sock_pool == NULL) {
- free(socksp);
- *error = EINVAL;
- return NULL;
- }
-
- ovs_assert(index < VPORT_SOCK_POOL_SIZE);
- socksp[i] = sock_pool[index].nl_sock;
- socksp[i] = sock_pool[index].nl_sock;
- ovs_assert(socksp[i]);
- index = (index == VPORT_SOCK_POOL_SIZE - 1) ? 0 : index + 1;
- handler->last_used_pool_idx = index;
- }
-
- return socksp;
-}
-
-static void
-vport_del_socksp_windows(struct dpif_netlink *dpif, struct nl_sock **socksp)
-{
- free(socksp);
-}
#endif /* _WIN32 */
-static struct nl_sock **
-vport_create_socksp(struct dpif_netlink *dpif, int *error)
-{
-#ifdef _WIN32
- return vport_create_socksp_windows(dpif, error);
-#else
- return vport_create_socksp__(dpif->n_handlers, error);
-#endif
-}
-
-static void
-vport_del_socksp(struct dpif_netlink *dpif, struct nl_sock **socksp)
-{
-#ifdef _WIN32
- vport_del_socksp_windows(dpif, socksp);
-#else
- vport_del_socksp__(socksp, dpif->n_handlers);
-#endif
-}
-
-/* Given the array of pointers to netlink sockets 'socksp', returns
- * the array of corresponding pids. If the 'socksp' is NULL, returns
- * a single-element array of value 0. */
-static uint32_t *
-vport_socksp_to_pids(struct nl_sock **socksp, uint32_t n_socks)
-{
- uint32_t *pids;
-
- if (!socksp) {
- pids = xzalloc(sizeof *pids);
- } else {
- size_t i;
-
- pids = xzalloc(n_socks * sizeof *pids);
- for (i = 0; i < n_socks; i++) {
- pids[i] = nl_sock_pid(socksp[i]);
- }
- }
-
- return pids;
-}
-
-/* Given the port number 'port_idx', extracts the pids of netlink sockets
- * associated to the port and assigns it to 'upcall_pids'. */
+/* Given the port number 'port_idx', extracts the pid of netlink socket
+ * associated to the port and assigns it to 'upcall_pid'. */
static bool
-vport_get_pids(struct dpif_netlink *dpif, uint32_t port_idx,
- uint32_t **upcall_pids)
+vport_get_pid(struct dpif_netlink *dpif, uint32_t port_idx,
+ uint32_t *upcall_pid)
{
- uint32_t *pids;
- size_t i;
-
/* Since the nl_sock can only be assigned in either all
- * or none "dpif->handlers" channels, the following check
+ * or none "dpif" channels, the following check
* would suffice. */
- if (!dpif->handlers[0].channels[port_idx].sock) {
+ if (!dpif->channels[port_idx].sock) {
return false;
}
ovs_assert(!WINDOWS || dpif->n_handlers <= 1);
- pids = xzalloc(dpif->n_handlers * sizeof *pids);
-
- for (i = 0; i < dpif->n_handlers; i++) {
- pids[i] = nl_sock_pid(dpif->handlers[i].channels[port_idx].sock);
- }
-
- *upcall_pids = pids;
+ *upcall_pid = nl_sock_pid(dpif->channels[port_idx].sock);
return true;
}
static int
-vport_add_channels(struct dpif_netlink *dpif, odp_port_t port_no,
- struct nl_sock **socksp)
+vport_add_channel(struct dpif_netlink *dpif, odp_port_t port_no,
+ struct nl_sock *socksp)
{
struct epoll_event event;
uint32_t port_idx = odp_to_u32(port_no);
- size_t i, j;
+ size_t i;
int error;
if (dpif->handlers == NULL) {
@@ -553,7 +425,7 @@ vport_add_channels(struct dpif_netlink *dpif, odp_port_t port_no,
/* We assume that the datapath densely chooses port numbers, which can
* therefore be used as an index into 'channels' and 'epoll_events' of
- * 'dpif->handler'. */
+ * 'dpif'. */
if (port_idx >= dpif->uc_array_size) {
uint32_t new_size = port_idx + 1;
@@ -563,15 +435,15 @@ vport_add_channels(struct dpif_netlink *dpif, odp_port_t port_no,
return EFBIG;
}
- for (i = 0; i < dpif->n_handlers; i++) {
- struct dpif_handler *handler = &dpif->handlers[i];
+ dpif->channels = xrealloc(dpif->channels,
+ new_size * sizeof *dpif->channels);
- handler->channels = xrealloc(handler->channels,
- new_size * sizeof *handler->channels);
+ for (i = dpif->uc_array_size; i < new_size; i++) {
+ dpif->channels[i].sock = NULL;
+ }
- for (j = dpif->uc_array_size; j < new_size; j++) {
- handler->channels[j].sock = NULL;
- }
+ for (i = 0; i < dpif->n_handlers; i++) {
+ struct dpif_handler *handler = &dpif->handlers[i];
handler->epoll_events = xrealloc(handler->epoll_events,
new_size * sizeof *handler->epoll_events);
@@ -581,33 +453,33 @@ vport_add_channels(struct dpif_netlink *dpif, odp_port_t port_no,
}
memset(&event, 0, sizeof event);
- event.events = EPOLLIN;
+ event.events = EPOLLIN | EPOLLEXCLUSIVE;
event.data.u32 = port_idx;
for (i = 0; i < dpif->n_handlers; i++) {
struct dpif_handler *handler = &dpif->handlers[i];
#ifndef _WIN32
- if (epoll_ctl(handler->epoll_fd, EPOLL_CTL_ADD, nl_sock_fd(socksp[i]),
+ if (epoll_ctl(handler->epoll_fd, EPOLL_CTL_ADD, nl_sock_fd(socksp),
&event) < 0) {
error = errno;
goto error;
}
#endif
- dpif->handlers[i].channels[port_idx].sock = socksp[i];
- dpif->handlers[i].channels[port_idx].last_poll = LLONG_MIN;
}
+ dpif->channels[port_idx].sock = socksp;
+ dpif->channels[port_idx].last_poll = LLONG_MIN;
return 0;
error:
- for (j = 0; j < i; j++) {
#ifndef _WIN32
- epoll_ctl(dpif->handlers[j].epoll_fd, EPOLL_CTL_DEL,
- nl_sock_fd(socksp[j]), NULL);
-#endif
- dpif->handlers[j].channels[port_idx].sock = NULL;
+ while (i--) {
+ epoll_ctl(dpif->handlers[i].epoll_fd, EPOLL_CTL_DEL,
+ nl_sock_fd(socksp), NULL);
}
+#endif
+ dpif->channels[port_idx].sock = NULL;
return error;
}
@@ -618,14 +490,8 @@ vport_del_channels(struct dpif_netlink *dpif, odp_port_t port_no)
uint32_t port_idx = odp_to_u32(port_no);
size_t i;
- if (!dpif->handlers || port_idx >= dpif->uc_array_size) {
- return;
- }
-
- /* Since the sock can only be assigned in either all or none
- * of "dpif->handlers" channels, the following check would
- * suffice. */
- if (!dpif->handlers[0].channels[port_idx].sock) {
+ if (!dpif->handlers || port_idx >= dpif->uc_array_size
+ || !dpif->channels[port_idx].sock) {
return;
}
@@ -633,12 +499,14 @@ vport_del_channels(struct dpif_netlink *dpif, odp_port_t port_no)
struct dpif_handler *handler = &dpif->handlers[i];
#ifndef _WIN32
epoll_ctl(handler->epoll_fd, EPOLL_CTL_DEL,
- nl_sock_fd(handler->channels[port_idx].sock), NULL);
- nl_sock_destroy(handler->channels[port_idx].sock);
+ nl_sock_fd(dpif->channels[port_idx].sock), NULL);
#endif
- handler->channels[port_idx].sock = NULL;
handler->event_offset = handler->n_events = 0;
}
+#ifndef _WIN32
+ nl_sock_destroy(dpif->channels[port_idx].sock);
+#endif
+ dpif->channels[port_idx].sock = NULL;
}
static void
@@ -655,10 +523,7 @@ destroy_all_channels(struct dpif_netlink *dpif)
struct dpif_netlink_vport vport_request;
uint32_t upcall_pids = 0;
- /* Since the sock can only be assigned in either all or none
- * of "dpif->handlers" channels, the following check would
- * suffice. */
- if (!dpif->handlers[0].channels[i].sock) {
+ if (!dpif->channels[i].sock) {
continue;
}
@@ -679,11 +544,11 @@ destroy_all_channels(struct dpif_netlink *dpif)
dpif_netlink_handler_uninit(handler);
free(handler->epoll_events);
- free(handler->channels);
}
-
+ free(dpif->channels);
free(dpif->handlers);
dpif->handlers = NULL;
+ dpif->channels = NULL;
dpif->n_handlers = 0;
dpif->uc_array_size = 0;
}
@@ -846,13 +711,12 @@ dpif_netlink_port_add__(struct dpif_netlink *dpif, const char *name,
{
struct dpif_netlink_vport request, reply;
struct ofpbuf *buf;
- struct nl_sock **socksp = NULL;
- uint32_t *upcall_pids;
+ struct nl_sock *socksp = NULL;
+ uint32_t upcall_pids;
int error = 0;
if (dpif->handlers) {
- socksp = vport_create_socksp(dpif, &error);
- if (!socksp) {
+ if (nl_sock_create(NETLINK_GENERIC, &socksp)) {
return error;
}
}
@@ -864,9 +728,9 @@ dpif_netlink_port_add__(struct dpif_netlink *dpif, const char *name,
request.name = name;
request.port_no = *port_nop;
- upcall_pids = vport_socksp_to_pids(socksp, dpif->n_handlers);
- request.n_upcall_pids = socksp ? dpif->n_handlers : 1;
- request.upcall_pids = upcall_pids;
+ upcall_pids = nl_sock_pid(socksp);
+ request.n_upcall_pids = 1;
+ request.upcall_pids = &upcall_pids;
if (options) {
request.options = options->data;
@@ -882,31 +746,27 @@ dpif_netlink_port_add__(struct dpif_netlink *dpif, const char *name,
dpif_name(&dpif->dpif), *port_nop);
}
- vport_del_socksp(dpif, socksp);
+ nl_sock_destroy(socksp);
goto exit;
}
- if (socksp) {
- error = vport_add_channels(dpif, *port_nop, socksp);
- if (error) {
- VLOG_INFO("%s: could not add channel for port %s",
- dpif_name(&dpif->dpif), name);
-
- /* Delete the port. */
- dpif_netlink_vport_init(&request);
- request.cmd = OVS_VPORT_CMD_DEL;
- request.dp_ifindex = dpif->dp_ifindex;
- request.port_no = *port_nop;
- dpif_netlink_vport_transact(&request, NULL, NULL);
- vport_del_socksp(dpif, socksp);
- goto exit;
- }
+ error = vport_add_channel(dpif, *port_nop, socksp);
+ if (error) {
+ VLOG_INFO("%s: could not add channel for port %s",
+ dpif_name(&dpif->dpif), name);
+
+ /* Delete the port. */
+ dpif_netlink_vport_init(&request);
+ request.cmd = OVS_VPORT_CMD_DEL;
+ request.dp_ifindex = dpif->dp_ifindex;
+ request.port_no = *port_nop;
+ dpif_netlink_vport_transact(&request, NULL, NULL);
+ nl_sock_destroy(socksp);
+ goto exit;
}
- free(socksp);
exit:
ofpbuf_delete(buf);
- free(upcall_pids);
return error;
}
@@ -1131,7 +991,7 @@ dpif_netlink_port_query_by_name(const struct dpif *dpif_, const char *devname,
static uint32_t
dpif_netlink_port_get_pid__(const struct dpif_netlink *dpif,
- odp_port_t port_no, uint32_t hash)
+ odp_port_t port_no, uint32_t hash OVS_UNUSED)
OVS_REQ_RDLOCK(dpif->upcall_lock)
{
uint32_t port_idx = odp_to_u32(port_no);
@@ -1141,14 +1001,13 @@ dpif_netlink_port_get_pid__(const struct dpif_netlink *dpif,
/* The ODPP_NONE "reserved" port number uses the "ovs-system"'s
* channel, since it is not heavily loaded. */
uint32_t idx = port_idx >= dpif->uc_array_size ? 0 : port_idx;
- struct dpif_handler *h = &dpif->handlers[hash % dpif->n_handlers];
/* Needs to check in case the socket pointer is changed in between
* the holding of upcall_lock. A known case happens when the main
* thread deletes the vport while the handler thread is handling
* the upcall from that port. */
- if (h->channels[idx].sock) {
- pid = nl_sock_pid(h->channels[idx].sock);
+ if (dpif->channels[idx].sock) {
+ pid = nl_sock_pid(dpif->channels[idx].sock);
}
}
@@ -2382,42 +2241,40 @@ dpif_netlink_refresh_channels(struct dpif_netlink *dpif, uint32_t n_handlers)
dpif_netlink_port_dump_start__(dpif, &dump);
while (!dpif_netlink_port_dump_next__(dpif, &dump, &vport, &buf)) {
uint32_t port_no = odp_to_u32(vport.port_no);
- uint32_t *upcall_pids = NULL;
+ uint32_t upcall_pid;
int error;
if (port_no >= dpif->uc_array_size
- || !vport_get_pids(dpif, port_no, &upcall_pids)) {
- struct nl_sock **socksp = vport_create_socksp(dpif, &error);
+ || !vport_get_pid(dpif, port_no, &upcall_pid)) {
+ struct nl_sock *socksp;
- if (!socksp) {
+ if (nl_sock_create(NETLINK_GENERIC, &socksp)) {
goto error;
}
- error = vport_add_channels(dpif, vport.port_no, socksp);
+ error = vport_add_channel(dpif, vport.port_no, socksp);
if (error) {
VLOG_INFO("%s: could not add channels for port %s",
dpif_name(&dpif->dpif), vport.name);
- vport_del_socksp(dpif, socksp);
+ nl_sock_destroy(socksp);
retval = error;
goto error;
}
- upcall_pids = vport_socksp_to_pids(socksp, dpif->n_handlers);
- free(socksp);
+ upcall_pid = nl_sock_pid(socksp);
}
/* Configure the vport to deliver misses to 'sock'. */
if (vport.upcall_pids[0] == 0
- || vport.n_upcall_pids != dpif->n_handlers
- || memcmp(upcall_pids, vport.upcall_pids, n_handlers * sizeof
- *upcall_pids)) {
+ || vport.n_upcall_pids != 1
+ || upcall_pid != vport.upcall_pids[0]) {
struct dpif_netlink_vport vport_request;
dpif_netlink_vport_init(&vport_request);
vport_request.cmd = OVS_VPORT_CMD_SET;
vport_request.dp_ifindex = dpif->dp_ifindex;
vport_request.port_no = vport.port_no;
- vport_request.n_upcall_pids = dpif->n_handlers;
- vport_request.upcall_pids = upcall_pids;
+ vport_request.n_upcall_pids = 1;
+ vport_request.upcall_pids = &upcall_pid;
error = dpif_netlink_vport_transact(&vport_request, NULL, NULL);
if (error) {
VLOG_WARN_RL(&error_rl,
@@ -2438,11 +2295,9 @@ dpif_netlink_refresh_channels(struct dpif_netlink *dpif, uint32_t n_handlers)
if (port_no < keep_channels_nbits) {
bitmap_set1(keep_channels, port_no);
}
- free(upcall_pids);
continue;
error:
- free(upcall_pids);
vport_del_channels(dpif, vport.port_no);
}
nl_dump_done(&dump);
@@ -2701,7 +2556,7 @@ dpif_netlink_recv__(struct dpif_netlink *dpif, uint32_t handler_id,
while (handler->event_offset < handler->n_events) {
int idx = handler->epoll_events[handler->event_offset].data.u32;
- struct dpif_channel *ch = &dpif->handlers[handler_id].channels[idx];
+ struct dpif_channel *ch = &dpif->channels[idx];
handler->event_offset++;
@@ -2803,16 +2658,14 @@ dpif_netlink_recv_purge__(struct dpif_netlink *dpif)
OVS_REQ_WRLOCK(dpif->upcall_lock)
{
if (dpif->handlers) {
- size_t i, j;
+ size_t i;
+ if (!dpif->channels[0].sock) {
+ return;
+ }
for (i = 0; i < dpif->uc_array_size; i++ ) {
- if (!dpif->handlers[0].channels[i].sock) {
- continue;
- }
- for (j = 0; j < dpif->n_handlers; j++) {
- nl_sock_drain(dpif->handlers[j].channels[i].sock);
- }
+ nl_sock_drain(dpif->channels[i].sock);
}
}
}
--
2.17.1

@ -0,0 +1,179 @@
From 71981938b2db070c130ec717aab141cd9c0fa860 Mon Sep 17 00:00:00 2001
From: Numan Siddique <nusiddiq@redhat.com>
Date: Tue, 6 Nov 2018 11:59:38 +0530
Subject: [PATCH] ovn-nbctl: Fix the ovn-nbctl test "LBs - daemon" which fails
during rpm build
When 'make check' is called by the mock rpm build (which disables networking),
the test "ovn-nbctl: LBs - daemon" fails when it runs the command
"ovn-nbctl lb-add lb0 30.0.0.1a 192.168.10.10:80,192.168.10.20:80". ovn-nbctl
extracts the VIP by calling the socket-util function 'inet_parse_active()',
and this function blocks when the libunbound function ub_resolve() is called
further down. ub_resolve() is a blocking call with no timeout, and all the
OVS/OVN utilities use it.
As reported by Timothy Redaelli, the issue can also be reproduced by running
the below commands
$ sudo unshare -mn -- sh -c 'ip addr add dev lo 127.0.0.1 && \
mount --bind /dev/null /etc/resolv.conf && runuser $SUDO_USER'
$ make sandbox SANDBOXFLAGS="--ovn"
$ ovn-nbctl -vsocket_util:off lb-add lb0 30.0.0.1a \
192.168.10.10:80,192.168.10.20:80
To address this issue, this patch adds a new bool argument 'resolve_host' to
the function inet_parse_active() so that the host is resolved only when it is
'true'. ovn-nbctl and ovn-northd pass 'false' when they call this function to
parse the load balancer values.
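As a rough usage sketch (illustrative only; the helper below is hypothetical
and not part of the patch), a caller that only needs to split an address/port
pair can now pass 'false' and avoid the blocking resolver path entirely:

    #include <stdbool.h>
    #include <sys/socket.h>
    #include "socket-util.h"   /* declares inet_parse_active() */

    /* Hypothetical caller: parse a load-balancer VIP such as
     * "192.168.10.10:80" without ever invoking ub_resolve(). */
    static bool
    parse_vip(const char *vip, struct sockaddr_storage *ss)
    {
        /* default_port = 0 keeps the port optional; resolve_host = false
         * means a non-numeric host simply fails to parse instead of
         * triggering DNS resolution. */
        return inet_parse_active(vip, 0, ss, false);
    }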
Reported-by: Timothy Redaelli <tredaelli@redhat.com>
Reported-at: https://bugzilla.redhat.com/show_bug.cgi?id=1641672
Signed-off-by: Numan Siddique <nusiddiq@redhat.com>
Signed-off-by: Ben Pfaff <blp@ovn.org>
---
lib/socket-util.c | 7 ++++---
lib/socket-util.h | 2 +-
lib/stream.c | 2 +-
ofproto/ofproto-dpif-sflow.c | 2 +-
ovn/northd/ovn-northd.c | 2 +-
ovn/utilities/ovn-nbctl.c | 6 +++---
ovsdb/raft-private.c | 2 +-
7 files changed, 12 insertions(+), 11 deletions(-)
diff --git a/lib/socket-util.c b/lib/socket-util.c
index 504f4cd59..5f82e89c1 100644
--- a/lib/socket-util.c
+++ b/lib/socket-util.c
@@ -518,12 +518,13 @@ exit:
* is optional and defaults to 'default_port' (use 0 to make the kernel choose
* an available port, although this isn't usually appropriate for active
* connections). If 'default_port' is negative, then <port> is required.
+ * It resolves the host if 'resolve_host' is true.
*
* On success, returns true and stores the parsed remote address into '*ss'.
* On failure, logs an error, stores zeros into '*ss', and returns false. */
bool
inet_parse_active(const char *target_, int default_port,
- struct sockaddr_storage *ss)
+ struct sockaddr_storage *ss, bool resolve_host)
{
char *target = xstrdup(target_);
char *port, *host;
@@ -538,7 +539,7 @@ inet_parse_active(const char *target_, int default_port,
ok = false;
} else {
ok = parse_sockaddr_components(ss, host, port, default_port,
- target_, true);
+ target_, resolve_host);
}
if (!ok) {
memset(ss, 0, sizeof *ss);
@@ -575,7 +576,7 @@ inet_open_active(int style, const char *target, int default_port,
int error;
/* Parse. */
- if (!inet_parse_active(target, default_port, &ss)) {
+ if (!inet_parse_active(target, default_port, &ss, true)) {
error = EAFNOSUPPORT;
goto exit;
}
diff --git a/lib/socket-util.h b/lib/socket-util.h
index 6d386304d..a65433d90 100644
--- a/lib/socket-util.h
+++ b/lib/socket-util.h
@@ -49,7 +49,7 @@ ovs_be32 guess_netmask(ovs_be32 ip);
void inet_parse_host_port_tokens(char *s, char **hostp, char **portp);
void inet_parse_port_host_tokens(char *s, char **portp, char **hostp);
bool inet_parse_active(const char *target, int default_port,
- struct sockaddr_storage *ssp);
+ struct sockaddr_storage *ssp, bool resolve_host);
int inet_open_active(int style, const char *target, int default_port,
struct sockaddr_storage *ssp, int *fdp, uint8_t dscp);
diff --git a/lib/stream.c b/lib/stream.c
index 4e15fe0c8..c4dabda39 100644
--- a/lib/stream.c
+++ b/lib/stream.c
@@ -751,7 +751,7 @@ stream_parse_target_with_default_port(const char *target, int default_port,
struct sockaddr_storage *ss)
{
return ((!strncmp(target, "tcp:", 4) || !strncmp(target, "ssl:", 4))
- && inet_parse_active(target + 4, default_port, ss));
+ && inet_parse_active(target + 4, default_port, ss, true));
}
/* Attempts to guess the content type of a stream whose first few bytes were
diff --git a/ofproto/ofproto-dpif-sflow.c b/ofproto/ofproto-dpif-sflow.c
index 62a09b5d1..7da31753c 100644
--- a/ofproto/ofproto-dpif-sflow.c
+++ b/ofproto/ofproto-dpif-sflow.c
@@ -468,7 +468,7 @@ sflow_choose_agent_address(const char *agent_device,
const char *target;
SSET_FOR_EACH (target, targets) {
struct sockaddr_storage ss;
- if (inet_parse_active(target, SFL_DEFAULT_COLLECTOR_PORT, &ss)) {
+ if (inet_parse_active(target, SFL_DEFAULT_COLLECTOR_PORT, &ss, true)) {
/* sFlow only supports target in default routing table with
* packet mark zero.
*/
diff --git a/ovn/northd/ovn-northd.c b/ovn/northd/ovn-northd.c
index 5e61708be..d59fc45ca 100644
--- a/ovn/northd/ovn-northd.c
+++ b/ovn/northd/ovn-northd.c
@@ -3204,7 +3204,7 @@ ip_address_and_port_from_lb_key(const char *key, char **ip_address,
uint16_t *port, int *addr_family)
{
struct sockaddr_storage ss;
- if (!inet_parse_active(key, 0, &ss)) {
+ if (!inet_parse_active(key, 0, &ss, false)) {
static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 1);
VLOG_WARN_RL(&rl, "bad ip address or port for load balancer key %s",
key);
diff --git a/ovn/utilities/ovn-nbctl.c b/ovn/utilities/ovn-nbctl.c
index 42aac2251..09bbcf76a 100644
--- a/ovn/utilities/ovn-nbctl.c
+++ b/ovn/utilities/ovn-nbctl.c
@@ -2553,7 +2553,7 @@ nbctl_lb_add(struct ctl_context *ctx)
}
struct sockaddr_storage ss_vip;
- if (!inet_parse_active(lb_vip, 0, &ss_vip)) {
+ if (!inet_parse_active(lb_vip, 0, &ss_vip, false)) {
ctl_error(ctx, "%s: should be an IP address (or an IP address "
"and a port number with : as a separator).", lb_vip);
return;
@@ -2583,7 +2583,7 @@ nbctl_lb_add(struct ctl_context *ctx)
struct sockaddr_storage ss_dst;
if (lb_vip_port) {
- if (!inet_parse_active(token, -1, &ss_dst)) {
+ if (!inet_parse_active(token, -1, &ss_dst, false)) {
ctl_error(ctx, "%s: should be an IP address and a port "
"number with : as a separator.", token);
goto out;
@@ -2702,7 +2702,7 @@ lb_info_add_smap(const struct nbrec_load_balancer *lb,
const struct smap_node *node = nodes[i];
struct sockaddr_storage ss;
- if (!inet_parse_active(node->key, 0, &ss)) {
+ if (!inet_parse_active(node->key, 0, &ss, false)) {
continue;
}
diff --git a/ovsdb/raft-private.c b/ovsdb/raft-private.c
index 07996e35b..e5e2c29cf 100644
--- a/ovsdb/raft-private.c
+++ b/ovsdb/raft-private.c
@@ -33,7 +33,7 @@ raft_address_validate(const char *address)
return NULL;
} else if (!strncmp(address, "ssl:", 4) || !strncmp(address, "tcp:", 4)) {
struct sockaddr_storage ss;
- if (!inet_parse_active(address + 4, -1, &ss)) {
+ if (!inet_parse_active(address + 4, -1, &ss, true)) {
return ovsdb_error(NULL, "%s: syntax error in address", address);
}
return NULL;
--
2.19.1

@ -1,70 +0,0 @@
From 72bb6af9f31f3d6a000a7f22f9a82939119f63af Mon Sep 17 00:00:00 2001
From: Justin Pettit <jpettit@ovn.org>
Date: Wed, 5 Sep 2018 16:51:09 -0700
Subject: [PATCH] ovn.at: Skip ACL rate-limiting test on slow/overloaded
systems.
In the ACL rate-limiting test, we send three sets of 100 packets. One of
the sets drops packets at a rate of 10 per second, one at a rate of 5
per second, and one not at all. On my setup, it takes roughly 0.67
seconds to send those 300 packets, but we have reports of it taking over
15 seconds on others. The test was intended to allow some flexibility
in run-time, but it's very difficult to design a mechanism that can handle all
possibilities.
To prevent false test failures, this patch changes the test to check
the duration count of the meter, and if it's greater than nine seconds,
just skip the test.
Signed-off-by: Justin Pettit <jpettit@ovn.org>
Reported-by: Thomas Goirand <zigo@debian.org>
---
tests/ovn.at | 21 +++++++++++++++------
1 file changed, 15 insertions(+), 6 deletions(-)
diff --git a/tests/ovn.at b/tests/ovn.at
index 031b6ada0..61b4f8924 100644
--- a/tests/ovn.at
+++ b/tests/ovn.at
@@ -6500,15 +6500,25 @@ for i in `seq 1 100`; do
ovs-appctl netdev-dummy/receive lp1 'in_port(1),eth(src=f0:00:00:00:00:01,dst=f0:00:00:00:00:02),eth_type(0x0800),ipv4(src=192.168.1.2,dst=192.168.1.3,proto=6,tos=0,ttl=64,frag=no),tcp(src=7777,dst=82)'
done
+# The rate at which packets are sent is highly system-dependent, so we
+# can't count on precise drop counts. To work around that, we just
+# check that exactly 100 "http-acl3" actions were logged and that there
+# were more "http-acl1" actions than "http-acl2" ones.
+OVS_WAIT_UNTIL([ test 100 = $(grep -c 'http-acl3' hv/ovn-controller.log) ])
+
+# On particularly slow or overloaded systems, the transmission rate may
+# be lower than the configured meter rate. To prevent false test
+# failures, we check the duration count of the meter, and if it's
+# greater than nine seconds, just skip the test.
+d_secs=$(as hv ovs-ofctl -O OpenFlow13 meter-stats br-int | grep "meter:1" | sed 's/.* duration:\([[0-9]]\{1,\}\)\.[[0-9]]\+s .*/\1/')
+
+echo "Meter duration: $d_secs"
+AT_SKIP_IF([test $d_secs -gt 9])
+
# Print some information that may help debugging.
as hv ovs-appctl -t ovn-controller meter-table-list
as hv ovs-ofctl -O OpenFlow13 meter-stats br-int
-# The userspace meter implementation doesn't precisely enforce counts,
-# so we just check that exactly 100 "http-acl3" actions were logged and
-# that there were more "http-acl1" actions than "http-acl2" ones.
-OVS_WAIT_UNTIL([ test 100 = $(grep -c 'http-acl3' hv/ovn-controller.log) ])
-
n_acl1=$(grep -c 'http-acl1' hv/ovn-controller.log)
n_acl2=$(grep -c 'http-acl2' hv/ovn-controller.log)
n_acl3=$(grep -c 'http-acl3' hv/ovn-controller.log)
@@ -6516,7 +6526,6 @@ n_acl3=$(grep -c 'http-acl3' hv/ovn-controller.log)
AT_CHECK([ test $n_acl3 -gt $n_acl1 ], [0], [])
AT_CHECK([ test $n_acl1 -gt $n_acl2 ], [0], [])
-
OVN_CLEANUP([hv])
AT_CLEANUP
--
2.17.1

@ -1,38 +0,0 @@
From 949758946767ff79b4c3eb5eca755c6cf21643e3 Mon Sep 17 00:00:00 2001
From: Timothy Redaelli <tredaelli@redhat.com>
Date: Sun, 9 Sep 2018 14:20:02 +0200
Subject: [PATCH] ovs-save: Don't always include the default flow during
restore
Currently the default flow (actions=NORMAL) is still present in the flow table
after the flow table is restored, even when the default flow had been removed.
This commit changes the behaviour of the "ovs-save save-flows" command to use
"replace-flows" instead of "add-flows" to restore the flows. This is needed in
order to always have the new flow table as it was before restoring it.
Reported-by: Flavio Leitner <fbl@sysclose.org>
Reported-at: https://bugzilla.redhat.com/show_bug.cgi?id=1626096
Signed-off-by: Timothy Redaelli <tredaelli@redhat.com>
Acked-by: Flavio Leitner <fbl@sysclose.org>
Signed-off-by: Gurucharan Shetty <guru@ovn.org>
---
utilities/ovs-save | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/utilities/ovs-save b/utilities/ovs-save
index ea8fb6a45..2294583d6 100755
--- a/utilities/ovs-save
+++ b/utilities/ovs-save
@@ -121,7 +121,7 @@ save_flows () {
cnt++;printf "{class="$1",type="$2",len="$3"}->"$4}'
echo "'"
- printf "%s" "ovs-ofctl -O $ofp_version add-flows ${bridge} " \
+ printf "%s" "ovs-ofctl -O $ofp_version replace-flows ${bridge} " \
"\"$workdir/$bridge.flows.dump\""
# If possible, use OpenFlow 1.4 atomic bundle transaction to add flows
--
2.17.1

@ -1,41 +0,0 @@
Date: Sun, 2 Sep 2018 09:30:43 -0700
From: Ben Pfaff <blp@ovn.org>
To: dev@openvswitch.org
Cc: Ben Pfaff <blp@ovn.org>
Subject: [ovs-dev] [PATCH] ofproto-dpif-xlate: Fix translation of groups with
no buckets.
Message-Id: <20180902163043.11210-1-blp@ovn.org>
List-Id: <ovs-dev.openvswitch.org>
X-Bogosity: Unsure, tests=bogofilter, spamicity=0.500000, version=1.2.4
A group can have no buckets, in which case ovs_list_back() assert-fails.
This fixes the problem.
Found by OFTest.
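For context, a simplified standalone sketch (not OVS code; the list type below
is a stand-in for struct ovs_list) of why taking the raw 'prev' pointer is
safe: in a circular doubly linked list, an empty list's 'prev' points back at
the head itself, so 'last_bucket' becomes a harmless sentinel that the
zero-iteration loop never dereferences, whereas ovs_list_back() asserts that
the list is non-empty:

    #include <stdio.h>

    /* Simplified stand-in for OVS's circular doubly linked list. */
    struct list { struct list *prev, *next; };

    static void list_init(struct list *l) { l->prev = l->next = l; }

    int
    main(void)
    {
        struct list buckets;
        list_init(&buckets);                /* a group with no buckets */

        struct list *last = buckets.prev;   /* == &buckets: safe sentinel */

        /* Mirrors LIST_FOR_EACH: zero iterations on an empty list, so
         * 'last' is compared against but never dereferenced. */
        for (struct list *b = buckets.next; b != &buckets; b = b->next) {
            printf("bucket %p, is_last=%d\n", (void *) b, b == last);
        }
        return 0;
    }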
Fixes: a04e58881e25 ("ofproto-dpif-xlate: Simplify translation for groups.")
Signed-off-by: Ben Pfaff <blp@ovn.org>
---
ofproto/ofproto-dpif-xlate.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/ofproto/ofproto-dpif-xlate.c b/ofproto/ofproto-dpif-xlate.c
index e26f6c8f554a..507e14dd0d00 100644
--- a/ofproto/ofproto-dpif-xlate.c
+++ b/ofproto/ofproto-dpif-xlate.c
@@ -4488,7 +4488,7 @@ xlate_group_action__(struct xlate_ctx *ctx, struct group_dpif *group,
bool is_last_action)
{
if (group->up.type == OFPGT11_ALL || group->up.type == OFPGT11_INDIRECT) {
- struct ovs_list *last_bucket = ovs_list_back(&group->up.buckets);
+ struct ovs_list *last_bucket = group->up.buckets.prev;
struct ofputil_bucket *bucket;
LIST_FOR_EACH (bucket, list_node, &group->up.buckets) {
bool is_last_bucket = &bucket->list_node == last_bucket;
--
2.16.1
_______________________________________________
dev mailing list
dev@openvswitch.org
https://mail.openvswitch.org/mailman/listinfo/ovs-dev

@ -58,11 +58,16 @@
%global with_python3 0
%endif
+%if 0%{?centos} == 7
+# Carried over from 2.6.1 CBS builds, introduced to win over 2.6.90
+Epoch: 1
+%endif
Name: openvswitch
Summary: Open vSwitch daemon/database/utilities
URL: http://www.openvswitch.org/
-Version: 2.10.0
-Release: 4%{?commit0:.%{date}git%{shortcommit0}}%{?dist}
+Version: 2.10.1
+Release: 1%{?commit0:.%{date}git%{shortcommit0}}%{?dist}
# Nearly all of openvswitch is ASL 2.0. The bugtool is LGPLv2+, and the
# lib/sflow*.[ch] files are SISSL
@ -81,25 +86,7 @@ Source: http://openvswitch.org/releases/%{name}-%{version}.tar.gz
# ovs-patches
# OVS (including OVN) backports (0 - 300)
+Patch10: 0001-ovn-nbctl-Fix-the-ovn-nbctl-test-LBs-daemon-which-fa.patch
-Patch010: ofproto-dpif-xlate_Fix_translation_of_groups_with_no_bu.patch
-Patch020: 0001-ovs-save-Don-t-always-include-the-default-flow-durin.patch
-# Bug 1631797
-Patch030: 0001-dpif-netdev-Add-round-robin-based-rxq-to-pmd-assignm.patch
-# Bug 1565205
-Patch040: 0001-dpif-netdev-Avoid-reordering-of-packets-in-a-batch-w.patch
-# Bug 1634015
-Patch050: 0001-dpif-netlink-don-t-allocate-per-thread-netlink-socke.patch
-Patch051: 0001-dpif-Remove-support-for-multiple-queues-per-port.patch
-# Bug 1635344
-Patch070: 0001-OVN-add-CT_LB-action-to-ovn-trace.patch
-Patch080: 0001-ovn.at-Skip-ACL-rate-limiting-test-on-slow-overloade.patch
BuildRequires: gcc gcc-c++ make
BuildRequires: autoconf automake libtool
@ -134,6 +121,11 @@ BuildRequires: libcap-ng libcap-ng-devel
%if %{with dpdk}
%ifarch %{dpdkarches}
BuildRequires: dpdk-devel libpcap-devel numactl-devel
+# Currently DPDK on Extras/AppStream includes the mlx{4,5} glue libraries, so
+# libibverbs is needed to run the tests (make check).
+%if 0%{?rhel}
+BuildRequires: libibverbs >= 15
+%endif
%endif
%endif
@ -162,7 +154,11 @@ License: ASL 2.0
Requires: %{_py2} %{_py2}-six
%if "%{_py2}" == "python2"
Obsoletes: python-openvswitch < 2.6.1-2
-Provides: python-openvswitch = %{version}-%{release}
+Provides: python-openvswitch = %{?epoch:%{epoch}:}%{version}-%{release}
+%endif
+%if 0%{?centos} == 7
+Obsoletes: python2-openvswitch
+Provides: python2-openvswitch = %{?epoch:%{epoch}:}%{version}-%{release}
%endif
%description -n %{_py2}-openvswitch
@ -176,7 +172,7 @@ License: ASL 2.0
Requires: python3 python3-six
%if ! %{with_python2}
Obsoletes: python-openvswitch < 2.10.0-6
-Provides: python-openvswitch = %{version}-%{release}
+Provides: python-openvswitch = %{?epoch:%{epoch}:}%{version}-%{release}
%endif
%description -n python3-openvswitch
@ -188,10 +184,10 @@ Summary: Open vSwitch testing utilities
License: ASL 2.0
BuildArch: noarch
%if %{with_python2}
-Requires: %{_py2}-openvswitch = %{version}-%{release}
+Requires: %{_py2}-openvswitch = %{?epoch:%{epoch}:}%{version}-%{release}
Requires: %{_py2} %{_py2}-twisted%{?rhel:-web}
%else
-Requires: python3-openvswitch = %{version}-%{release}
+Requires: python3-openvswitch = %{?epoch:%{epoch}:}%{version}-%{release}
%endif
%description test
@ -698,7 +694,7 @@ chown -R openvswitch:openvswitch /etc/openvswitch
%else
%exclude %{_mandir}/man8/ovs-dpctl-top.8*
%endif
-%if 0%{?rhel} && 0%{?rhel} <= 7
+%if (0%{?rhel} && 0%{?rhel} <= 7) || (0%{?fedora} && 0%{?fedora} < 29)
%{_sysconfdir}/sysconfig/network-scripts/ifup-ovs
%{_sysconfdir}/sysconfig/network-scripts/ifdown-ovs
%endif
@ -751,6 +747,9 @@ chown -R openvswitch:openvswitch /etc/openvswitch
%{_unitdir}/ovn-controller-vtep.service
%changelog
+* Wed Nov 28 2018 Timothy Redaelli <tredaelli@redhat.com> - 2.10.1-1
+- Rebase to 2.10.1
* Wed Nov 21 2018 Timothy Redaelli <tredaelli@redhat.com> - 2.10.0-4
- Fix C JSON library creation on Fedora Rawhide and exit if shared library cannot be created
