Skip to content

Commit 91531e4

Browse files
committed
vpp-manager: rebind interfaces to kernel if link is missing
If VPP is killed abruptly (e.g. by the OOM killer), it does not shut down gracefully and does not restore interface bindings. As a result, interfaces may remain bound to a DPDK driver and no longer appear as Linux network devices. When the expected interface is not found, attempt to rebind PCI devices back to the kernel driver and retry the lookup once. Also, because VPP does not exit gracefully in that case, pingCalicoVpp() is never called and the agent never restarts, so we now call it at startup as well to terminate any stale agents left over from a previous run.
1 parent 5cb69df commit 91531e4

File tree

2 files changed

+66
-1
lines changed

2 files changed

+66
-1
lines changed

config/config.go

Lines changed: 56 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@ import (
2222
"net"
2323
"os"
2424
"os/exec"
25+
"path/filepath"
2526
"regexp"
2627
"sort"
2728
"strconv"
@@ -726,13 +727,67 @@ type LinuxInterfaceState struct {
726727
TapSwIfIndex uint32
727728
}
728729

730+
// bindPCIDevicesToKernel detaches PCI devices that are still bound to a
// userspace I/O driver (igb_uio, uio_pci_generic or vfio-pci) and asks the
// kernel to rescan the PCI bus so that they get re-probed.
//
// A device is only detached when it exposes a "config" file and `fuser -s`
// reports that no process has that file open. Devices that are in use, or
// whose usage cannot be determined because fuser could not be executed, are
// left untouched.
//
// The function is best-effort across devices: a failure on one device does
// not prevent the remaining ones from being processed, and the bus rescan is
// always attempted once at least one device has been removed, so that removed
// devices are not left missing from the system. It returns the rescan error if
// the rescan fails, otherwise the first error encountered while processing
// devices, or nil.
func bindPCIDevicesToKernel() error {
	drivers := []string{"igb_uio", "uio_pci_generic", "vfio-pci"}
	removed := false

	// firstErr remembers the first per-device failure; it is reported only
	// after the rescan below has been given a chance to run.
	var firstErr error
	recordErr := func(err error) {
		if firstErr == nil {
			firstErr = err
		}
	}

	for _, driver := range drivers {
		pattern := filepath.Join("/sys/bus/pci/drivers", driver, "*")
		matches, err := filepath.Glob(pattern)
		if err != nil {
			recordErr(err)
			continue
		}

		for _, f := range matches {
			configPath := filepath.Join(f, "config")

			// Entries without a config file (bind, unbind, new_id, module, ...)
			// are not devices: skip them.
			if _, err := os.Stat(configPath); err != nil {
				continue
			}

			// Check whether the config file is in use via fuser.
			fuserErr := exec.Command("fuser", "-s", configPath).Run()
			if fuserErr == nil {
				// fuser found a user, skip
				continue
			}
			if _, exited := fuserErr.(*exec.ExitError); !exited {
				// fuser did not run at all (e.g. binary missing), so we cannot
				// tell whether the device is in use. Do not remove it blindly.
				recordErr(fuserErr)
				continue
			}

			// Write "1" to the remove file to detach the device.
			removePath := filepath.Join(f, "remove")
			if err := os.WriteFile(removePath, []byte("1"), 0200); err != nil {
				recordErr(err)
				continue
			}

			removed = true
		}
	}

	// If any device was removed, rescan the PCI bus, even when another device
	// failed, otherwise the removed devices would stay gone.
	if removed {
		if err := os.WriteFile("/sys/bus/pci/rescan", []byte("1"), 0200); err != nil {
			return err
		}
	}

	return firstErr
}
775+
729776
func LoadInterfaceConfigFromLinux(interfaceName string) (*LinuxInterfaceState, error) {
730777
conf := LinuxInterfaceState{
731778
TapSwIfIndex: ^uint32(0), // in case we forget to set it
732779
}
733780
link, err := netlink.LinkByName(interfaceName)
734781
if err != nil {
735-
return nil, errors.Wrapf(err, "cannot find interface named %s", interfaceName)
782+
// attempt binding PCI devices to kernel
783+
bindErr := bindPCIDevicesToKernel()
784+
if bindErr != nil {
785+
return nil, errors.Wrapf(err, "cannot find interface named %s, cannot bind pci devices to kernel: %v", interfaceName, bindErr)
786+
}
787+
link, err = netlink.LinkByName(interfaceName)
788+
if err != nil {
789+
return nil, errors.Wrapf(err, "cannot find interface named %s after binding devices to kernel", interfaceName)
790+
}
736791
}
737792
conf.IsUp = (link.Attrs().Flags & net.FlagUp) != 0
738793
if conf.IsUp {

vpp-manager/vpp_runner.go

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -944,6 +944,16 @@ func (v *VppRunner) runVpp() (err error) {
944944
}
945945
vppProcess = vppCmd.Process
946946
}
947+
/**
948+
* Ensure any stale Calico VPP agents are terminated by calling pingCalicoVPP().
949+
* This handles cases where a previous VPP instance was killed abruptly and
950+
* didn't restore its configuration.
951+
* Must be done before writing the info file to make sure the new agent
952+
* doesn't respond to SIGUSR1 and avoid killing it */
953+
err = v.pingCalicoVpp()
954+
if err != nil {
955+
log.Errorf("Error pinging calico-vpp: %v", err)
956+
}
947957

948958
defer func() {
949959
if r := recover(); r != nil {

0 commit comments

Comments
 (0)