SoFunction
Updated on 2025-04-14

Linode Xen grsecurity >= 4.3 Crash issue

Since Linux 4.3, when using PaX/gsecurity on Linode, the kernel crashes immediately after being executed by pv-grub. Since the crash occurs very early after startup, there is no log for debugging, and the company does not have the cover open, and there is no way to get meaningful debugging information on the mother machine. This has resulted in the cover's VPS core being locked at 4.2.7 since December last year. Because I don’t know when Linode Tokyo computer room would migrate from Xen to KVM in June 2016, I didn’t spend any effort trying to debug this problem.

However, all hardware doubled during Linode's anniversary this year, except for the Tokyo computer room. According to the latest official statement, the new computer room is optimistically expected to be launched in the fourth quarter. To solve the kernel problem, we have to put it on the agenda. First, we have manually repaired many high-risk CVE vulnerabilities, and then used diff to toss for a long time. The kernel will always die immediately after starting. And since grsecurity does not provide git source, git bisect is also impossible. The only available tool is the Linux 4.2.7/patch file, and the Linux 4.3.3/patch file.

When reading code differences, a big challenge is how to distinguish between modifications of upstream kernels and modifications of downstream PaX/gsecurity patches. Directly comparing patch files can lead to lost code context, making the code's intention incomprehensible. Finally, the lid intends to write a tool called metadiff, which automatically compares and removes the code segments that appear in the upstream, so as to compare only the code of PaX/gsecurity. Even after thinking about the name, it is called metadiff, but it has not taken action.

Until last month when I was chatting with Shawn, I mentioned that it was not impossible to install Xen by myself; so on Saturday, I finally hit Debian + Xen in the VirutalBox virtual machine, and started a virtual machine in Xen, and it was true that the kernel crashed traceback was quickly obtained.

rip: ffffffff8100b2b0 pmu_msr_read+0x10
flags: 00000282 i s nz
rsp: ffffffff81aeff30
rax: 8000000000000000  rcx: 0000000000000001  rdx: ffffffff81aeffcc
rbx: 00000000c0000080  rsi: ffffffff81aeffa0  rdi: 00000000c0000080
rbp: ffffffff81aeffa0  r8: 0000000000000001  r9: 00000000ffffffff
r10: ffffffff81cf9000  r11: 0000000000000000  r12: ffffffff81aeffcc
r13: ffffffff81aeffc4  r14: ffffffff81aeffc0  r15: 6f73b764afec1c9d
 cs: e033    ss: e02b    ds: 0000    es: 0000
 fs: 0000 @ 0000000000000000
 gs: 0000 @ 0000000000000000/0000000000000000
Code (instr addr ffffffff8100b2b0)
00 00 00 00 00 41 54 49 89 d4 55 48 89 f5 53 89 fb 48 83 ec 10 <65> 48 8b 04 25 28 00 00 00 48 89 


Stack:
 0000000000000001 0000000000000000 0000000000000000 ffffffff8100b2b0
 000000010000e030 0000000000010082 ffffffff81aeff70 000000000000e02b
 0000000000000000 0000000000000000 00000000c0000080 ffffffff81aeffcc
 ffffffff81aeffc8 ffffffff810041c8 ffffffff81aeffc8 ffffffff81aeffcc

Call Trace:
 [<ffffffff8100b2b0>] pmu_msr_read+0x10 <--
 [<ffffffff8100b2b0>] pmu_msr_read+0x10
 [<ffffffff810041c8>] xen_read_msr_safe+0x18
 [<ffffffff81be93eb>] xen_start_kernel+0x1b9

oh? It can be seen that the kernel crashed soon in xen_start_kernel. This is /* First C function to be called on Xen boot */. It crashed so early, and it is not surprising that the error log cannot be seen. Let's see what changes have occurred between xen_read_msr and pmu_msr_read between 4.2 and 4.3:

--- ../../4.2.7/linux-4.2.7/arch/x86/xen/  2016-09-11 00:44:12.010022936 +0800
+++ arch/x86/xen/  2015-12-15 13:41:43.000000000 +0800

@@ -1030,6 +1034,9 @@ static u64 xen_read_msr_safe(unsigned in
 {
    u64 val;

+    if (pmu_msr_read(msr, &val, err))
+        return val;
+
    val = native_read_msr_safe(msr, err);
    switch (msr) {
    case MSR_IA32_APICBASE:
@@ -1074,9 +1081,11 @@ static int xen_write_msr_safe(unsigned i
        /* Fast syscall setup is all done in hypercalls, so
          these are all ignored. Stub them out here to stop
          Xen console noise. */
+        break;

    default:
-        ret = native_write_msr_safe(msr, low, high);
+        if (!pmu_msr_write(msr, low, high, &ret))
+            ret = native_write_msr_safe(msr, low, high);
    }

    return ret;

It can be seen that pmu_msr_read is completely new, and you can continue to track it down using git blame.

xen/PMU: Initialization code for Xen PMU 65d0cf0be79feebeb19e7626fd3ed41ae73f642d
xen/PMU: Describe vendor-specific PMU registers e27b72df01109c689062caeba1defa013b759e0e
xen/PMU: Intercept PMU-related MSR and APIC accesses 6b08cd6328c58a2ae190c5ee03a2ffcab5ef828e
xen/PMU: PMU emulation code bf6dfb154d935725c9a2005033ca33017b9df439

I found that PMU is a new feature of Xen entering the mainline kernel in 4.3, so the solution is very simple. Just cancel both bf6dfb and 6b08cd. Let PaX Team and spender investigate the next thing. The final patch is:

diff -uprN linux-4.7.3-hardened/arch/x86/xen/ linux-4.7./arch/x86/xen/
  --- linux-4.7.3-hardened/arch/x86/xen/  2016-07-24 19:23:50.000000000 +0000
  +++ linux-4.7./arch/x86/xen/  2016-09-10 20:05:21.450647009 +0000
  @@ -7,7 +7,6 @@
   #include <xen/>
   #include <xen/interface/>
   #include ""
  -#include ""
   #include ""

   static unsigned int xen_io_apic_read(unsigned apic, unsigned reg)
  @@ -73,10 +72,8 @@ static u32 xen_apic_read(u32 reg)

   static void xen_apic_write(u32 reg, u32 val)
   {
  -  if (reg == APIC_LVTPC) {
  -   (void)pmu_apic_update(reg);
  +  if (reg == APIC_LVTPC)
      return;
  -  }

    /* Warn to see if there's any stray references */
    WARN(1,"register: %x, value: %x\n", reg, val);
  diff -uprN linux-4.7.3-hardened/arch/x86/xen/ linux-4.7./arch/x86/xen/
  --- linux-4.7.3-hardened/arch/x86/xen/  2016-09-10 19:59:29.237313676 +0000
  +++ linux-4.7./arch/x86/xen/  2016-09-10 20:06:49.683980342 +0000
  @@ -1031,9 +1031,6 @@ static u64 xen_read_msr_safe(unsigned in
   {
    u64 val;

  -  if (pmu_msr_read(msr, &val, err))
  -   return val;
  -
    val = native_read_msr_safe(msr, err);
    switch (msr) {
    case MSR_IA32_APICBASE:
  @@ -1081,13 +1078,17 @@ static int xen_write_msr_safe(unsigned i
      break;

    default:
  -   if (!pmu_msr_write(msr, low, high, &ret))
  -     ret = native_write_msr_safe(msr, low, high);
  +   ret = native_write_msr_safe(msr, low, high);
    }

    return ret;
   }

  +unsigned long long xen_read_pmc(int counter)
  +{
  +  return 0;
  +}
  +
   static u64 xen_read_msr(unsigned int msr)
   {
    /*
  diff -uprN linux-4.7.3-hardened/arch/x86/xen/ linux-4.7./arch/x86/xen/
  --- linux-4.7.3-hardened/arch/x86/xen/  2016-07-24 19:23:50.000000000 +0000
  +++ linux-4.7./arch/x86/xen/  2016-09-10 20:05:21.450647009 +0000
  @@ -13,20 +13,11 @@
   /* x86_pmu.handle_irq definition */
   #include "../events/perf_event.h"

  -#define XENPMU_IRQ_PROCESSING  1
  -struct xenpmu {
  -  /* Shared page between hypervisor and domain */
  -  struct xen_pmu_data *xenpmu_data;

  -  uint8_t flags;
  -};
  -static DEFINE_PER_CPU(struct xenpmu, xenpmu_shared);
  -#define get_xenpmu_data()  (this_cpu_ptr(&xenpmu_shared)->xenpmu_data)
  -#define get_xenpmu_flags()  (this_cpu_ptr(&xenpmu_shared)->flags)
  -
  -/* Macro for computing address of a PMU MSR bank */
  -#define field_offset(ctxt, field) ((void *)((uintptr_t)ctxt + \
  -          (uintptr_t)ctxt->field))
  +/* Shared page between hypervisor and domain */
  +static DEFINE_PER_CPU(struct xen_pmu_data *, xenpmu_shared);
  +#define get_xenpmu_data()  per_cpu(xenpmu_shared, smp_processor_id())
  +

   /* AMD PMU */
   #define F15H_NUM_COUNTERS  6
  @@ -60,8 +51,6 @@ static __read_mostly int amd_num_counter
   /* Alias registers (0x4c1) for full-width writes to PMCs */
   #define MSR_PMC_ALIAS_MASK     (~(MSR_IA32_PERFCTR0 ^ MSR_IA32_PMC0))

  -#define INTEL_PMC_TYPE_SHIFT    30
  -
   static __read_mostly int intel_num_arch_counters, intel_num_fixed_counters;


  @@ -178,232 +167,6 @@ static int is_intel_pmu_msr(u32 msr_inde
    }
   }

  -static bool xen_intel_pmu_emulate(unsigned int msr, u64 *val, int type,
  -       int index, bool is_read)
  -{
  -  uint64_t *reg = NULL;
  -  struct xen_pmu_intel_ctxt *ctxt;
  -  uint64_t *fix_counters;
  -  struct xen_pmu_cntr_pair *arch_cntr_pair;
  -  struct xen_pmu_data *xenpmu_data = get_xenpmu_data();
  -  uint8_t xenpmu_flags = get_xenpmu_flags();
  -
  -
  -  if (!xenpmu_data || !(xenpmu_flags & XENPMU_IRQ_PROCESSING))
  -   return false;
  -
  -  ctxt = &xenpmu_data->;
  -
  -  switch (msr) {
  -  case MSR_CORE_PERF_GLOBAL_OVF_CTRL:
  -   reg = &ctxt->global_ovf_ctrl;
  -   break;
  -  case MSR_CORE_PERF_GLOBAL_STATUS:
  -   reg = &ctxt->global_status;
  -   break;
  -  case MSR_CORE_PERF_GLOBAL_CTRL:
  -   reg = &ctxt->global_ctrl;
  -   break;
  -  case MSR_CORE_PERF_FIXED_CTR_CTRL:
  -   reg = &ctxt->fixed_ctrl;
  -   break;
  -  default:
  -   switch (type) {
  -   case MSR_TYPE_COUNTER:
  -     fix_counters = field_offset(ctxt, fixed_counters);
  -     reg = &fix_counters[index];
  -     break;
  -   case MSR_TYPE_ARCH_COUNTER:
  -     arch_cntr_pair = field_offset(ctxt, arch_counters);
  -     reg = &arch_cntr_pair[index].counter;
  -     break;
  -   case MSR_TYPE_ARCH_CTRL:
  -     arch_cntr_pair = field_offset(ctxt, arch_counters);
  -     reg = &arch_cntr_pair[index].control;
  -     break;
  -   default:
  -     return false;
  -   }
  -  }
  -
  -  if (reg) {
  -   if (is_read)
  -     *val = *reg;
  -   else {
  -     *reg = *val;
  -
  -     if (msr == MSR_CORE_PERF_GLOBAL_OVF_CTRL)
  -      ctxt->global_status &= (~(*val));
  -   }
  -   return true;
  -  }
  -
  -  return false;
  -}
  -
  -static bool xen_amd_pmu_emulate(unsigned int msr, u64 *val, bool is_read)
  -{
  -  uint64_t *reg = NULL;
  -  int i, off = 0;
  -  struct xen_pmu_amd_ctxt *ctxt;
  -  uint64_t *counter_regs, *ctrl_regs;
  -  struct xen_pmu_data *xenpmu_data = get_xenpmu_data();
  -  uint8_t xenpmu_flags = get_xenpmu_flags();
  -
  -  if (!xenpmu_data || !(xenpmu_flags & XENPMU_IRQ_PROCESSING))
  -   return false;
  -
  -  if (k7_counters_mirrored &&
  -    ((msr >= MSR_K7_EVNTSEL0) && (msr <= MSR_K7_PERFCTR3)))
  -   msr = get_fam15h_addr(msr);
  -
  -  ctxt = &xenpmu_data->;
  -  for (i = 0; i < amd_num_counters; i++) {
  -   if (msr == amd_ctrls_base + off) {
  -     ctrl_regs = field_offset(ctxt, ctrls);
  -     reg = &ctrl_regs[i];
  -     break;
  -   } else if (msr == amd_counters_base + off) {
  -     counter_regs = field_offset(ctxt, counters);
  -     reg = &counter_regs[i];
  -     break;
  -   }
  -   off += amd_msr_step;
  -  }
  -
  -  if (reg) {
  -   if (is_read)
  -     *val = *reg;
  -   else
  -     *reg = *val;
  -
  -   return true;
  -  }
  -  return false;
  -}
  -
  -bool pmu_msr_read(unsigned int msr, uint64_t *val, int *err)
  -{
  -  if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) {
  -   if (is_amd_pmu_msr(msr)) {
  -     if (!xen_amd_pmu_emulate(msr, val, 1))
  -      *val = native_read_msr_safe(msr, err);
  -     return true;
  -   }
  -  } else {
  -   int type, index;
  -
  -   if (is_intel_pmu_msr(msr, &type, &index)) {
  -     if (!xen_intel_pmu_emulate(msr, val, type, index, 1))
  -      *val = native_read_msr_safe(msr, err);
  -     return true;
  -   }
  -  }
  -
  -  return false;
  -}
  -
  -bool pmu_msr_write(unsigned int msr, uint32_t low, uint32_t high, int *err)
  -{
  -  uint64_t val = ((uint64_t)high << 32) | low;
  -
  -  if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) {
  -   if (is_amd_pmu_msr(msr)) {
  -     if (!xen_amd_pmu_emulate(msr, &val, 0))
  -      *err = native_write_msr_safe(msr, low, high);
  -     return true;
  -   }
  -  } else {
  -   int type, index;
  -
  -   if (is_intel_pmu_msr(msr, &type, &index)) {
  -     if (!xen_intel_pmu_emulate(msr, &val, type, index, 0))
  -      *err = native_write_msr_safe(msr, low, high);
  -     return true;
  -   }
  -  }
  -
  -  return false;
  -}
  -
  -static unsigned long long xen_amd_read_pmc(int counter)
  -{
  -  struct xen_pmu_amd_ctxt *ctxt;
  -  uint64_t *counter_regs;
  -  struct xen_pmu_data *xenpmu_data = get_xenpmu_data();
  -  uint8_t xenpmu_flags = get_xenpmu_flags();
  -
  -  if (!xenpmu_data || !(xenpmu_flags & XENPMU_IRQ_PROCESSING)) {
  -   uint32_t msr;
  -   int err;
  -
  -   msr = amd_counters_base + (counter * amd_msr_step);
  -   return native_read_msr_safe(msr, &err);
  -  }
  -
  -  ctxt = &xenpmu_data->;
  -  counter_regs = field_offset(ctxt, counters);
  -  return counter_regs[counter];
  -}
  -
  -static unsigned long long xen_intel_read_pmc(int counter)
  -{
  -  struct xen_pmu_intel_ctxt *ctxt;
  -  uint64_t *fixed_counters;
  -  struct xen_pmu_cntr_pair *arch_cntr_pair;
  -  struct xen_pmu_data *xenpmu_data = get_xenpmu_data();
  -  uint8_t xenpmu_flags = get_xenpmu_flags();
  -
  -  if (!xenpmu_data || !(xenpmu_flags & XENPMU_IRQ_PROCESSING)) {
  -   uint32_t msr;
  -   int err;
  -
  -   if (counter & (1 << INTEL_PMC_TYPE_SHIFT))
  -     msr = MSR_CORE_PERF_FIXED_CTR0 + (counter & 0xffff);
  -   else
  -     msr = MSR_IA32_PERFCTR0 + counter;
  -
  -   return native_read_msr_safe(msr, &err);
  -  }
  -
  -  ctxt = &xenpmu_data->;
  -  if (counter & (1 << INTEL_PMC_TYPE_SHIFT)) {
  -   fixed_counters = field_offset(ctxt, fixed_counters);
  -   return fixed_counters[counter & 0xffff];
  -  }
  -
  -  arch_cntr_pair = field_offset(ctxt, arch_counters);
  -  return arch_cntr_pair[counter].counter;
  -}
  -
  -unsigned long long xen_read_pmc(int counter)
  -{
  -  if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD)
  -   return xen_amd_read_pmc(counter);
  -  else
  -   return xen_intel_read_pmc(counter);
  -}
  -
  -int pmu_apic_update(uint32_t val)
  -{
  -  int ret;
  -  struct xen_pmu_data *xenpmu_data = get_xenpmu_data();
  -
  -  if (!xenpmu_data) {
  -   pr_warn_once("%s: pmudata not initialized\n", __func__);
  -   return -EINVAL;
  -  }
  -
  -  xenpmu_data->.lapic_lvtpc = val;
  -
  -  if (get_xenpmu_flags() & XENPMU_IRQ_PROCESSING)
  -   return 0;
  -
  -  ret = HYPERVISOR_xenpmu_op(XENPMU_lvtpc_set, NULL);
  -
  -  return ret;
  -}
  -
   /* perf callbacks */
   static int xen_is_in_guest(void)
   {
  @@ -476,37 +239,26 @@ static void xen_convert_regs(const struc

   irqreturn_t xen_pmu_irq_handler(int irq, void *dev_id)
   {
  -  int err, ret = IRQ_NONE;
  +  int ret = IRQ_NONE;
    struct pt_regs regs;
    const struct xen_pmu_data *xenpmu_data = get_xenpmu_data();
  -  uint8_t xenpmu_flags = get_xenpmu_flags();

    if (!xenpmu_data) {
      pr_warn_once("%s: pmudata not initialized\n", __func__);
      return ret;
    }

  -  this_cpu_ptr(&xenpmu_shared)->flags =
  -   xenpmu_flags | XENPMU_IRQ_PROCESSING;
    xen_convert_regs(&xenpmu_data->, ®s,
        xenpmu_data->pmu.pmu_flags);
    if (x86_pmu.handle_irq(®s))
      ret = IRQ_HANDLED;

  -  /* Write out cached context to HW */
  -  err = HYPERVISOR_xenpmu_op(XENPMU_flush, NULL);
  -  this_cpu_ptr(&xenpmu_shared)->flags = xenpmu_flags;
  -  if (err) {
  -   pr_warn_once("%s: failed hypercall, err: %d\n", __func__, err);
  -   return IRQ_NONE;
  -  }
  -
    return ret;
   }

   bool is_xen_pmu(int cpu)
   {
  -  return (get_xenpmu_data() != NULL);
  +  return (per_cpu(xenpmu_shared, cpu) != NULL);
   }

   void xen_pmu_init(int cpu)
  @@ -536,8 +288,7 @@ void xen_pmu_init(int cpu)
    if (err)
      goto fail;

  -  per_cpu(xenpmu_shared, cpu).xenpmu_data = xenpmu_data;
  -  per_cpu(xenpmu_shared, cpu).flags = 0;
  +  per_cpu(xenpmu_shared, cpu) = xenpmu_data;

    if (cpu == 0) {
      perf_register_guest_info_callbacks(&xen_guest_cbs);
  @@ -565,6 +316,6 @@ void xen_pmu_finish(int cpu)

    (void)HYPERVISOR_xenpmu_op(XENPMU_finish, &xp);

  -  free_pages((unsigned long)per_cpu(xenpmu_shared, cpu).xenpmu_data, 0);
  -  per_cpu(xenpmu_shared, cpu).xenpmu_data = NULL;
  +  free_pages((unsigned long)per_cpu(xenpmu_shared, cpu), 0);
  +  per_cpu(xenpmu_shared, cpu) = NULL;
   }
  diff -uprN linux-4.7.3-hardened/arch/x86/xen/ linux-4.7./arch/x86/xen/
  --- linux-4.7.3-hardened/arch/x86/xen/  2016-07-24 19:23:50.000000000 +0000
  +++ linux-4.7./arch/x86/xen/  2016-09-10 20:05:21.453980342 +0000
  @@ -7,9 +7,5 @@ irqreturn_t xen_pmu_irq_handler(int irq,
   void xen_pmu_init(int cpu);
   void xen_pmu_finish(int cpu);
   bool is_xen_pmu(int cpu);
  -bool pmu_msr_read(unsigned int msr, uint64_t *val, int *err);
  -bool pmu_msr_write(unsigned int msr, uint32_t low, uint32_t high, int *err);
  -int pmu_apic_update(uint32_t reg);
  -unsigned long long xen_read_pmc(int counter);

   #endif /* __XEN_PMU_H */

After patching it and compiling the kernel, the kernel locked by the smart coin was indeed successfully upgraded.

$ uname -r
4.7.3-hardened

Update: The official has fixed the issue in grsecurity-3.1-4.7., and this workaround is no longer needed.