x86: Add TSS-based protection domain support

This patch extends the protection domain framework with an additional
plugin that uses Task-State Segment (TSS) structures to offload much
of the work of switching protection domains to the CPU.  This can save
space compared to paging: paging requires two 4KiB page tables and one
32-byte page table, plus one whole-system TSS and an additional
32-byte data structure for each protection domain, whereas the
approach implemented by this patch requires just a 128-byte data
structure for each protection domain.  Only a small number of
protection domains will typically be used, so
n * 128 < 8328 + (n * 32),
where 8328 is the fixed paging overhead in bytes ((2 * 4096) + 32 for
the page tables plus 104 for the whole-system TSS); the inequality
holds for any n < 87.

For additional information, please refer to cpu/x86/mm/README.md.

GCC 6 is introducing named address spaces for the FS and GS segments
[1].  LLVM Clang also provides address spaces for the FS and GS
segments [2].  This patch also adds support to the multi-segment X86
memory management subsystem for using these features instead of inline
assembly blocks, which enables type checking to detect some address
space mismatches.

[1] https://gcc.gnu.org/onlinedocs/gcc/Named-Address-Spaces.html
[2] http://llvm.org/releases/3.3/tools/clang/docs/LanguageExtensions.html#target-specific-extensions
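As a sketch of what this buys (hypothetical function names; assumes a
compiler that defines __SEG_FS as described in [1] or [2]):

```c
#ifdef __SEG_FS
/* Dereferencing an FS-relative pointer compiles to an FS-prefixed
 * load, with no inline assembly required.
 */
int
read_kern_word(int __seg_fs *p)
{
  return *p;
}

int *
leak_pointer(int __seg_fs *p)
{
  /* return p;  -- the compiler rejects this with an address-space
   * mismatch error, a class of bug that inline assembly blocks
   * cannot catch. */
  return (int *)0;
}
#endif
```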
Michael LeMay 2015-08-07 15:43:10 -07:00
parent 3908253038
commit 4cdb7ba9b6
34 changed files with 1883 additions and 166 deletions

@@ -20,6 +20,11 @@ CFLAGS += -DX86_CONF_USE_INVLPG
endif
# This matches the definition of X86_CONF_PROT_DOMAINS__PAGING in prot-domains.h:
CFLAGS += -DX86_CONF_PROT_DOMAINS=1
else ifeq ($(X86_CONF_PROT_DOMAINS),tss)
# This matches the definition of X86_CONF_PROT_DOMAINS__TSS in prot-domains.h:
CFLAGS += -DX86_CONF_PROT_DOMAINS=2
X86_CONF_MULTI_SEG = 1
CONTIKI_SOURCEFILES += tss-prot-domains-asm.S
else
$(error Unrecognized setting for X86_CONF_PROT_DOMAINS: \
$(X86_CONF_PROT_DOMAINS). See cpu/x86/mm/README.md for \
@@ -30,6 +35,20 @@ ifeq ($(X86_CONF_SYSCALLS_INT),1)
CONTIKI_SOURCEFILES += syscalls-int-asm.S tss.c
endif
ifeq ($(X86_CONF_MULTI_SEG),1)
LINKERSCRIPT_SFX = _multi_seg
CONTIKI_SOURCEFILES += multi-segment.c
# Due to the way the multi-segment implementation of protection domains defines
# tightly-bounded stack segments, the base pointer register cannot be used as
# a general-purpose register in all circumstances. The stack segment is used
# by default for a data access that uses the base pointer as the base register
# to compute the address. If the data referenced by the base pointer is not
# on the stack, then the access will fail. Thus, it is necessary to disable
# the omit-frame-pointer optimization. See mm/README.md for more details of
# how multi-segment protection domains are implemented.
CFLAGS += -fno-omit-frame-pointer
endif
endif
CFLAGS += -m32 -march=i586 -mtune=i586
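The frame-pointer constraint described in the Makefile comment above
can be illustrated with a hypothetical sketch (the array and function
are invented for illustration):

```c
/* With -fomit-frame-pointer, %ebp may be reused as an ordinary base
 * register for non-stack data such as this array.  Accesses that use
 * %ebp as the base are directed to the SS segment by default, and the
 * tightly-bounded stack segment does not cover the array, so such a
 * load would fault.  Keeping the frame pointer confines %ebp-relative
 * accesses to the stack.
 */
static int samples[64];

int
sum_samples(void)
{
  int i, total = 0;
  for(i = 0; i < 64; i++) {
    total += samples[i]; /* could be emitted as addl (%ebp,%eax,4), %edx */
  }
  return total;
}
```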

@@ -45,5 +45,17 @@
.global start
start:
cli
#if X86_CONF_PROT_DOMAINS == X86_CONF_PROT_DOMAINS__TSS
/* TSS-based protection domains use a multi-segment model that defines
* tight bounds around stacks. That means that the bottom of the stack
* has an offset of 0, which is the address of the stacks_main symbol.
* The following code computes the physical load address of the top of
* the stack, which is what should be initially used as the stack
* pointer while the flat memory model is in use.
*/
lea _sdata_addr, %eax
lea (stacks_main + STACKS_SIZE_MAIN)(%eax), %esp
#else
mov $(stacks_main + STACKS_SIZE_MAIN), %esp
#endif
call cpu_boot_stage0

@@ -43,6 +43,6 @@
#endif
#endif
extern int _sbss_dma_addr, _ebss_dma_addr;
extern int _ebss_pre_dma_addr, _sbss_dma_addr, _ebss_dma_addr;
#endif /* CPU_X86_DMA_H_ */

@@ -138,7 +138,9 @@ SYSCALLS_DEFINE_SINGLETON(pci_irq_agent_set_pirq,
offset = 0x3146;
}
value = *(uint16_t*)(rcba_addr + offset);
prot_domains_enable_mmio();
MMIO_READW(value, *(uint16_t ATTR_MMIO_ADDR_SPACE *)(rcba_addr + offset));
/* clear interrupt pin route and set corresponding pirq. */
switch(pin) {
@@ -159,7 +161,9 @@
value |= (pirq << 12);
}
*(uint16_t*)(rcba_addr + offset) = value;
MMIO_WRITEW(*(uint16_t ATTR_MMIO_ADDR_SPACE *)(rcba_addr + offset), value);
prot_domains_disable_mmio();
}
/*---------------------------------------------------------------------------*/
/**
@@ -231,7 +235,7 @@ pci_pirq_set_irq(PIRQ pirq, uint8_t irq, uint8_t route_to_legacy)
* \param meta_sz Size of optional driver-defined metadata.
*/
void
pci_init(pci_driver_t *c_this,
pci_init(pci_driver_t ATTR_KERN_ADDR_SPACE *c_this,
pci_config_addr_t pci_addr,
size_t mmio_sz,
uintptr_t meta,

@@ -102,7 +102,7 @@ void pci_command_enable(pci_config_addr_t addr, uint32_t flags);
typedef dom_client_data_t pci_driver_t;
void pci_init(pci_driver_t *c_this,
void pci_init(pci_driver_t ATTR_KERN_ADDR_SPACE *c_this,
pci_config_addr_t pci_addr,
size_t mmio_sz,
uintptr_t meta,
@@ -113,10 +113,12 @@ void pci_root_complex_init(void);
void pci_root_complex_lock(void);
#define PCI_MMIO_READL(c_this, dest, reg_addr) \
dest = *((volatile uint32_t *) \
(((uintptr_t)PROT_DOMAINS_MMIO(c_this)) + (reg_addr)))
MMIO_READL(dest, \
*((volatile uint32_t ATTR_MMIO_ADDR_SPACE *) \
(((uintptr_t)PROT_DOMAINS_MMIO(c_this)) + (reg_addr))))
#define PCI_MMIO_WRITEL(c_this, reg_addr, src) \
*((volatile uint32_t *) \
(((uintptr_t)PROT_DOMAINS_MMIO(c_this)) + (reg_addr))) = (src)
MMIO_WRITEL(*((volatile uint32_t ATTR_MMIO_ADDR_SPACE *) \
(((uintptr_t)PROT_DOMAINS_MMIO(c_this)) + (reg_addr))), \
src)
#endif /* CPU_X86_DRIVERS_LEGACY_PC_PCI_H_ */

@@ -74,6 +74,11 @@ typedef struct uart_16x50_regs {
*/
#define UART_MMIO_SZ MIN_PAGE_SIZE
#else
/* Multi-segment protection domain implementations can control memory with
* byte granularity. Thus, only the registers defined in the uart_16x50_regs
* structure are included in the MMIO region allocated for this protection
* domain:
*/
#define UART_MMIO_SZ sizeof(uart_16x50_regs_t)
#endif
@@ -82,24 +87,30 @@ void uart_16x50_setup(uart_16x50_driver_t c_this, uint16_t dl);
/*---------------------------------------------------------------------------*/
SYSCALLS_DEFINE(uart_16x50_setup, uart_16x50_driver_t c_this, uint16_t dl)
{
uart_16x50_regs_t *regs = (uart_16x50_regs_t *)PROT_DOMAINS_MMIO(c_this);
uart_16x50_regs_t ATTR_MMIO_ADDR_SPACE *regs =
(uart_16x50_regs_t ATTR_MMIO_ADDR_SPACE *)PROT_DOMAINS_MMIO(c_this);
prot_domains_enable_mmio();
/* Set the DLAB bit to enable access to divisor settings. */
regs->lcr = UART_LCR_7_DLAB;
MMIO_WRITEL(regs->lcr, UART_LCR_7_DLAB);
/* The divisor settings configure the baud rate, and may need to be defined
* on a per-device basis.
*/
regs->rbr_thr_dll = dl & UINT8_MAX;
regs->ier_dlh = dl >> 8;
MMIO_WRITEL(regs->rbr_thr_dll, dl & UINT8_MAX);
MMIO_WRITEL(regs->ier_dlh, dl >> 8);
/* Clear the DLAB bit to enable access to other settings and configure other
* UART parameters.
*/
regs->lcr = UART_LCR_8BITS;
MMIO_WRITEL(regs->lcr, UART_LCR_8BITS);
/* Enable the FIFOs. */
regs->iir_fcr = UART_FCR_0_FIFOE | UART_FCR_1_RFIFOR | UART_FCR_2_XFIFOR;
MMIO_WRITEL(regs->iir_fcr,
UART_FCR_0_FIFOE | UART_FCR_1_RFIFOR | UART_FCR_2_XFIFOR);
prot_domains_disable_mmio();
}
/*---------------------------------------------------------------------------*/
/**
@@ -112,13 +123,21 @@ SYSCALLS_DEFINE(uart_16x50_setup, uart_16x50_driver_t c_this, uint16_t dl)
*/
SYSCALLS_DEFINE(uart_16x50_tx, uart_16x50_driver_t c_this, uint8_t c)
{
uart_16x50_regs_t *regs = (uart_16x50_regs_t *)PROT_DOMAINS_MMIO(c_this);
uint32_t ready;
uart_16x50_regs_t ATTR_MMIO_ADDR_SPACE *regs =
(uart_16x50_regs_t ATTR_MMIO_ADDR_SPACE *)PROT_DOMAINS_MMIO(c_this);
prot_domains_enable_mmio();
/* Wait for space in TX FIFO. */
while((regs->lsr & UART_LSR_5_THRE) == 0);
do {
MMIO_READL(ready, regs->lsr);
} while((ready & UART_LSR_5_THRE) == 0);
/* Add character to TX FIFO. */
regs->rbr_thr_dll = c;
MMIO_WRITEL(regs->rbr_thr_dll, c);
prot_domains_disable_mmio();
}
/*---------------------------------------------------------------------------*/
/**
@@ -128,10 +147,12 @@ SYSCALLS_DEFINE(uart_16x50_tx, uart_16x50_driver_t c_this, uint8_t c)
* \param dl Divisor setting to configure the baud rate.
*/
void
uart_16x50_init(uart_16x50_driver_t *c_this,
uart_16x50_init(uart_16x50_driver_t ATTR_KERN_ADDR_SPACE *c_this,
pci_config_addr_t pci_addr,
uint16_t dl)
{
uart_16x50_driver_t loc_c_this;
/* This assumes that the UART had an MMIO range assigned to it by the
* firmware during boot.
*/
@@ -141,6 +162,8 @@ uart_16x50_init(uart_16x50_driver_t *c_this,
SYSCALLS_INIT(uart_16x50_tx);
SYSCALLS_AUTHZ(uart_16x50_tx, *c_this);
uart_16x50_setup(*c_this, dl);
prot_domains_copy_dcd(&loc_c_this, c_this);
uart_16x50_setup(loc_c_this, dl);
}
/*---------------------------------------------------------------------------*/

@@ -35,7 +35,7 @@
typedef pci_driver_t uart_16x50_driver_t;
void uart_16x50_init(uart_16x50_driver_t *c_this,
void uart_16x50_init(uart_16x50_driver_t ATTR_KERN_ADDR_SPACE *c_this,
pci_config_addr_t pci_addr,
uint16_t dl);

@@ -216,13 +216,19 @@ SYSCALLS_DEFINE_SINGLETON(quarkX1000_eth_setup, drv, uintptr_t meta_phys_base)
{
uip_eth_addr mac_addr;
uint32_t mac_tmp1, mac_tmp2;
quarkX1000_eth_meta_t *loc_meta =
(quarkX1000_eth_meta_t *)PROT_DOMAINS_META(drv);
quarkX1000_eth_rx_desc_t rx_desc;
quarkX1000_eth_tx_desc_t tx_desc;
quarkX1000_eth_meta_t ATTR_META_ADDR_SPACE *loc_meta =
(quarkX1000_eth_meta_t ATTR_META_ADDR_SPACE *)PROT_DOMAINS_META(drv);
prot_domains_enable_mmio();
/* Read the MAC address from the device. */
PCI_MMIO_READL(drv, mac_tmp1, REG_ADDR_MACADDR_HI);
PCI_MMIO_READL(drv, mac_tmp2, REG_ADDR_MACADDR_LO);
prot_domains_disable_mmio();
/* Convert the data read from the device into the format expected by
* Contiki.
*/
@@ -245,29 +251,39 @@ SYSCALLS_DEFINE_SINGLETON(quarkX1000_eth_setup, drv, uintptr_t meta_phys_base)
uip_setethaddr(mac_addr);
/* Initialize transmit descriptor. */
loc_meta->tx_desc.tdes0 = 0;
loc_meta->tx_desc.tdes1 = 0;
tx_desc.tdes0 = 0;
tx_desc.tdes1 = 0;
loc_meta->tx_desc.buf1_ptr =
(uint8_t *)PROT_DOMAINS_META_OFF_TO_PHYS(
(uintptr_t)&loc_meta->tx_buf, meta_phys_base);
loc_meta->tx_desc.tx_end_of_ring = 1;
loc_meta->tx_desc.first_seg_in_frm = 1;
loc_meta->tx_desc.last_seg_in_frm = 1;
loc_meta->tx_desc.tx_end_of_ring = 1;
tx_desc.tx_end_of_ring = 1;
tx_desc.first_seg_in_frm = 1;
tx_desc.last_seg_in_frm = 1;
tx_desc.tx_end_of_ring = 1;
META_WRITEL(loc_meta->tx_desc.tdes0, tx_desc.tdes0);
META_WRITEL(loc_meta->tx_desc.tdes1, tx_desc.tdes1);
META_WRITEL(loc_meta->tx_desc.buf1_ptr,
(uint8_t *)PROT_DOMAINS_META_OFF_TO_PHYS(
(uintptr_t)&loc_meta->tx_buf, meta_phys_base));
META_WRITEL(loc_meta->tx_desc.buf2_ptr, 0);
/* Initialize receive descriptor. */
loc_meta->rx_desc.rdes0 = 0;
loc_meta->rx_desc.rdes1 = 0;
rx_desc.rdes0 = 0;
rx_desc.rdes1 = 0;
loc_meta->rx_desc.buf1_ptr =
(uint8_t *)PROT_DOMAINS_META_OFF_TO_PHYS(
(uintptr_t)&loc_meta->rx_buf, meta_phys_base);
loc_meta->rx_desc.own = 1;
loc_meta->rx_desc.first_desc = 1;
loc_meta->rx_desc.last_desc = 1;
loc_meta->rx_desc.rx_buf1_sz = UIP_BUFSIZE;
loc_meta->rx_desc.rx_end_of_ring = 1;
rx_desc.own = 1;
rx_desc.first_desc = 1;
rx_desc.last_desc = 1;
rx_desc.rx_buf1_sz = UIP_BUFSIZE;
rx_desc.rx_end_of_ring = 1;
META_WRITEL(loc_meta->rx_desc.rdes0, rx_desc.rdes0);
META_WRITEL(loc_meta->rx_desc.rdes1, rx_desc.rdes1);
META_WRITEL(loc_meta->rx_desc.buf1_ptr,
(uint8_t *)PROT_DOMAINS_META_OFF_TO_PHYS(
(uintptr_t)&loc_meta->rx_buf, meta_phys_base));
META_WRITEL(loc_meta->rx_desc.buf2_ptr, 0);
prot_domains_enable_mmio();
/* Install transmit and receive descriptors. */
PCI_MMIO_WRITEL(drv, REG_ADDR_RX_DESC_LIST,
@@ -298,8 +314,11 @@ SYSCALLS_DEFINE_SINGLETON(quarkX1000_eth_setup, drv, uintptr_t meta_phys_base)
/* Place the receiver state machine in the Running state. */
OP_MODE_1_START_RX);
prot_domains_disable_mmio();
printf(LOG_PFX "Enabled 100M full-duplex mode.\n");
}
/*---------------------------------------------------------------------------*/
/**
* \brief Poll for a received Ethernet frame.
@@ -313,33 +332,43 @@ SYSCALLS_DEFINE_SINGLETON(quarkX1000_eth_poll, drv, uint16_t * frame_len)
{
uint16_t *loc_frame_len;
uint16_t frm_len = 0;
quarkX1000_eth_meta_t *loc_meta =
(quarkX1000_eth_meta_t *)PROT_DOMAINS_META(drv);
quarkX1000_eth_rx_desc_t tmp_desc;
quarkX1000_eth_meta_t ATTR_META_ADDR_SPACE *loc_meta =
(quarkX1000_eth_meta_t ATTR_META_ADDR_SPACE *)PROT_DOMAINS_META(drv);
PROT_DOMAINS_VALIDATE_PTR(loc_frame_len, frame_len, sizeof(*frame_len));
META_READL(tmp_desc.rdes0, loc_meta->rx_desc.rdes0);
/* Check whether the RX descriptor is still owned by the device. If not,
* process the received frame or an error that may have occurred.
*/
if(loc_meta->rx_desc.own == 0) {
if(loc_meta->rx_desc.err_summary) {
if(tmp_desc.own == 0) {
META_READL(tmp_desc.rdes1, loc_meta->rx_desc.rdes1);
if(tmp_desc.err_summary) {
fprintf(stderr,
LOG_PFX "Error receiving frame: RDES0 = %08x, RDES1 = %08x.\n",
loc_meta->rx_desc.rdes0, loc_meta->rx_desc.rdes1);
tmp_desc.rdes0, tmp_desc.rdes1);
assert(0);
}
frm_len = loc_meta->rx_desc.frm_len;
frm_len = tmp_desc.frm_len;
assert(frm_len <= UIP_BUFSIZE);
memcpy(uip_buf, (void *)loc_meta->rx_buf, frm_len);
MEMCPY_FROM_META(uip_buf, loc_meta->rx_buf, frm_len);
/* Return ownership of the RX descriptor to the device. */
loc_meta->rx_desc.own = 1;
tmp_desc.own = 1;
META_WRITEL(loc_meta->rx_desc.rdes0, tmp_desc.rdes0);
prot_domains_enable_mmio();
/* Request that the device check for an available RX descriptor, since
* ownership of the descriptor was just transferred to the device.
*/
PCI_MMIO_WRITEL(drv, REG_ADDR_RX_POLL_DEMAND, 1);
prot_domains_disable_mmio();
}
*loc_frame_len = frm_len;
@@ -356,32 +385,45 @@ SYSCALLS_DEFINE_SINGLETON(quarkX1000_eth_poll, drv, uint16_t * frame_len)
*/
SYSCALLS_DEFINE_SINGLETON(quarkX1000_eth_send, drv)
{
quarkX1000_eth_meta_t *loc_meta =
(quarkX1000_eth_meta_t *)PROT_DOMAINS_META(drv);
quarkX1000_eth_tx_desc_t tmp_desc;
quarkX1000_eth_meta_t ATTR_META_ADDR_SPACE *loc_meta =
(quarkX1000_eth_meta_t ATTR_META_ADDR_SPACE *)PROT_DOMAINS_META(drv);
/* Wait until the TX descriptor is no longer owned by the device. */
while(loc_meta->tx_desc.own == 1);
do {
META_READL(tmp_desc.tdes0, loc_meta->tx_desc.tdes0);
} while(tmp_desc.own == 1);
META_READL(tmp_desc.tdes1, loc_meta->tx_desc.tdes1);
/* Check whether an error occurred transmitting the previous frame. */
if(loc_meta->tx_desc.err_summary) {
if(tmp_desc.err_summary) {
fprintf(stderr,
LOG_PFX "Error transmitting frame: TDES0 = %08x, TDES1 = %08x.\n",
loc_meta->tx_desc.tdes0, loc_meta->tx_desc.tdes1);
tmp_desc.tdes0, tmp_desc.tdes1);
assert(0);
}
/* Transmit the next frame. */
assert(uip_len <= UIP_BUFSIZE);
memcpy((void *)loc_meta->tx_buf, uip_buf, uip_len);
MEMCPY_TO_META(loc_meta->tx_buf, uip_buf, uip_len);
loc_meta->tx_desc.tx_buf1_sz = uip_len;
tmp_desc.tx_buf1_sz = uip_len;
loc_meta->tx_desc.own = 1;
META_WRITEL(loc_meta->tx_desc.tdes1, tmp_desc.tdes1);
tmp_desc.own = 1;
META_WRITEL(loc_meta->tx_desc.tdes0, tmp_desc.tdes0);
prot_domains_enable_mmio();
/* Request that the device check for an available TX descriptor, since
* ownership of the descriptor was just transferred to the device.
*/
PCI_MMIO_WRITEL(drv, REG_ADDR_TX_POLL_DEMAND, 1);
prot_domains_disable_mmio();
}
/*---------------------------------------------------------------------------*/
/**

@@ -56,7 +56,11 @@
#define HIGHEST_REG LS_SYNC
#if X86_CONF_PROT_DOMAINS == X86_CONF_PROT_DOMAINS__PAGING
#define MMIO_SZ MIN_PAGE_SIZE
#else
#define MMIO_SZ (HIGHEST_REG + 4)
#endif
PROT_DOMAINS_ALLOC(pci_driver_t, drv);
@@ -77,7 +81,9 @@ SYSCALLS_DEFINE_SINGLETON(quarkX1000_gpio_mmin, drv,
halt();
}
prot_domains_enable_mmio();
PCI_MMIO_READL(drv, *loc_res, offset);
prot_domains_disable_mmio();
}
static inline uint32_t
@@ -96,7 +102,9 @@ SYSCALLS_DEFINE_SINGLETON(quarkX1000_gpio_mmout, drv,
halt();
}
prot_domains_enable_mmio();
PCI_MMIO_WRITEL(drv, offset, val);
prot_domains_disable_mmio();
}
static inline void

@@ -51,7 +51,11 @@
#define I2C_IRQ 9
#if X86_CONF_PROT_DOMAINS == X86_CONF_PROT_DOMAINS__PAGING
#define MMIO_SZ MIN_PAGE_SIZE
#else
#define MMIO_SZ (QUARKX1000_IC_HIGHEST + 4)
#endif
typedef enum {
I2C_DIRECTION_READ,
@@ -99,7 +103,9 @@ SYSCALLS_DEFINE_SINGLETON(quarkX1000_i2c_mmin, drv,
halt();
}
prot_domains_enable_mmio();
PCI_MMIO_READL(drv, *loc_res, offset);
prot_domains_disable_mmio();
}
static inline uint32_t
@@ -119,7 +125,9 @@ SYSCALLS_DEFINE_SINGLETON(quarkX1000_i2c_mmout, drv,
halt();
}
prot_domains_enable_mmio();
PCI_MMIO_WRITEL(drv, offset, val);
prot_domains_disable_mmio();
}
static inline void

@@ -49,7 +49,7 @@ void
quarkX1000_uart_init(quarkX1000_uart_dev_t dev)
{
pci_config_addr_t pci_addr;
uart_16x50_driver_t *drv;
uart_16x50_driver_t ATTR_KERN_ADDR_SPACE *drv;
assert((dev == QUARK_X1000_UART_0) || (dev == QUARK_X1000_UART_1));
@@ -78,7 +78,11 @@ quarkX1000_uart_init(quarkX1000_uart_dev_t dev)
void
quarkX1000_uart_tx(quarkX1000_uart_dev_t dev, uint8_t c)
{
uart_16x50_driver_t drv;
assert((dev == QUARK_X1000_UART_0) || (dev == QUARK_X1000_UART_1));
uart_16x50_tx((dev == QUARK_X1000_UART_0) ? quarkX1000_uart0 : quarkX1000_uart1, c);
prot_domains_copy_dcd(&drv,
(dev == QUARK_X1000_UART_0) ?
&quarkX1000_uart0 : &quarkX1000_uart1);
uart_16x50_tx(drv, c);
}
/*---------------------------------------------------------------------------*/

@@ -42,8 +42,11 @@ double_fault_handler(struct interrupt_context context)
halt();
}
/*---------------------------------------------------------------------------*/
/* The OS has switched to its own segment descriptors. However, the protection
* domain support, if enabled, has not yet been fully activated.
/* The OS has switched to its own segment descriptors. When multi-segment
* protection domain support is enabled, this routine runs with the
* necessary address translations configured to invoke other routines that
* require those translations to be in place. However, the protection domain
* support, if enabled, has not yet been fully activated.
*/
static void
boot_stage1(void)
@@ -75,7 +78,8 @@ cpu_boot_stage0(void)
uintptr_t top_of_stack = STACKS_INIT_TOP;
#if X86_CONF_PROT_DOMAINS != X86_CONF_PROT_DOMAINS__NONE
uintptr_t *top_of_stack_ptr = (uintptr_t *)top_of_stack;
uintptr_t *top_of_stack_ptr =
(uintptr_t *)DATA_OFF_TO_PHYS_ADDR(top_of_stack);
top_of_stack_ptr[0] = (uintptr_t)prot_domains_launch_kernel;
top_of_stack_ptr[1] = (uintptr_t)prot_domains_launch_app;

@@ -72,7 +72,7 @@ set_descriptor(unsigned int index,
segment_desc_init(&descriptor, base, len, flag);
/* Save descriptor into gdt */
gdt[index] = descriptor;
gdt_insert_boot(index, descriptor);
}
/*---------------------------------------------------------------------------*/
void
@@ -86,15 +86,17 @@ gdt_copy_desc_change_dpl(unsigned int dest_idx,
halt();
}
desc = gdt[src_idx];
gdt_lookup(src_idx, &desc);
SEG_SET_FLAG(desc, DPL, dpl);
gdt[dest_idx] = desc;
gdt_insert(dest_idx, desc);
}
/*---------------------------------------------------------------------------*/
/* This function initializes the Global Descriptor Table. For simplicity, the
* memory is organized following the flat model. Thus, memory appears to
* Contiki as a single continuous address space. Code, data, and stack
* memory is initially organized following the flat model. Thus, memory appears
* to Contiki as a single continuous address space. Code, data, and stack
* are all contained in this address space (so called linear address space).
* Certain protection domain implementations switch to a multi-segment memory
* model later during boot.
*/
void
gdt_init(void)
@@ -103,7 +105,7 @@ gdt_init(void)
/* Initialize gdtr structure */
gdtr.limit = sizeof(segment_desc_t) * GDT_LEN - 1;
gdtr.base = (uint32_t) &gdt;
gdtr.base = KERN_DATA_OFF_TO_PHYS_ADDR(gdt);
/* Initialize descriptors */
set_descriptor(GDT_IDX_NULL, 0, 0, 0);
@@ -115,13 +117,20 @@
}
/*---------------------------------------------------------------------------*/
void
gdt_insert_boot(unsigned int idx, segment_desc_t desc)
{
((segment_desc_t *)KERN_DATA_OFF_TO_PHYS_ADDR(gdt))[idx] = desc;
}
/*---------------------------------------------------------------------------*/
void
gdt_insert(unsigned int idx, segment_desc_t desc)
{
if(GDT_LEN <= idx) {
halt();
}
gdt[idx] = desc;
KERN_WRITEL(gdt[idx].raw_lo, desc.raw_lo);
KERN_WRITEL(gdt[idx].raw_hi, desc.raw_hi);
}
/*---------------------------------------------------------------------------*/
void
@@ -131,6 +140,7 @@ gdt_lookup(unsigned int idx, segment_desc_t *desc)
halt();
}
*desc = gdt[idx];
KERN_READL(desc->raw_lo, gdt[idx].raw_lo);
KERN_READL(desc->raw_hi, gdt[idx].raw_hi);
}
/*---------------------------------------------------------------------------*/

@@ -35,13 +35,21 @@
#include "prot-domains.h"
#include "segmentation.h"
extern segment_desc_t gdt[];
extern int _ebss_gdt_addr;
extern segment_desc_t ATTR_KERN_ADDR_SPACE gdt[];
extern int ATTR_KERN_ADDR_SPACE _ebss_gdt_addr;
#define GDT_IDX_OF_DESC(ptr) \
((((uintptr_t)(ptr)) - ((uintptr_t)&gdt))/ \
sizeof(segment_desc_t))
typedef struct far_pointer {
/** Far pointer offset. */
uint32_t offset;
/** Far pointer segment/gate selector. */
uint16_t sel;
uint16_t pad;
} __attribute__((packed)) far_pointer_t;
/**
* \brief Compute the selector for a GDT entry allocated somewhere besides gdt.c.
* \param ptr Pointer to GDT descriptor.
@@ -49,14 +57,22 @@ extern int _ebss_gdt_addr;
*/
#define GDT_SEL_OF_DESC(ptr, rpl) GDT_SEL(GDT_IDX_OF_DESC(ptr), rpl)
#define ATTR_BSS_GDT __attribute__((section(".gdt_bss")))
#define ATTR_BSS_GDT_START __attribute__((section(".gdt_bss_start")))
/* Section for fixed GDT entries */
#define ATTR_BSS_GDT \
__attribute__((section(".gdt_bss"))) ATTR_KERN_ADDR_SPACE
/* Section for TSS and LDT descriptors for protection domains */
#define ATTR_BSS_GDT_MID \
__attribute__((used, section(".gdt_bss_mid"))) ATTR_KERN_ADDR_SPACE
/* Section for other GDT entries */
#define ATTR_BSS_GDT_START \
__attribute__((section(".gdt_bss_start"))) ATTR_KERN_ADDR_SPACE
void gdt_copy_desc_change_dpl(unsigned int dest_idx,
unsigned int src_idx,
unsigned dpl);
void gdt_init(void) ATTR_CODE_BOOT;
void gdt_insert(unsigned int idx, segment_desc_t desc);
void gdt_insert_boot(unsigned int idx, segment_desc_t desc) ATTR_CODE_BOOT;
void gdt_lookup(unsigned int idx, segment_desc_t *desc);
#endif /* GDT_H */

@@ -43,17 +43,23 @@ typedef struct idtr {
uint32_t base;
} __attribute__((packed)) idtr_t;
typedef struct intr_gate_desc {
uint16_t offset_low;
uint16_t selector; /* Segment Selector for destination code segment */
uint16_t fixed:11;
uint16_t d:1; /* Size of gate: 1 = 32 bits; 0 = 16 bits */
uint16_t pad:1;
uint16_t dpl:2; /* Descriptor Privilege Level */
uint16_t p:1; /* Segment Present flag */
uint16_t offset_high;
} __attribute__((packed)) intr_gate_desc_t;
typedef union intr_gate_desc {
struct __attribute__((packed)) {
uint16_t offset_low;
uint16_t selector; /* Segment Selector for destination code segment */
uint16_t fixed:11;
uint16_t d:1; /* Size of gate: 1 = 32 bits; 0 = 16 bits */
uint16_t pad:1;
uint16_t dpl:2; /* Descriptor Privilege Level */
uint16_t p:1; /* Segment Present flag */
uint16_t offset_high;
};
uint64_t raw;
struct {
uint32_t raw_lo;
uint32_t raw_hi;
};
} intr_gate_desc_t;
/* According to Intel Combined Manual, Vol. 3, Section 6.10, the base addresses
* of the IDT should be aligned on an 8-byte boundary to maximize performance
@@ -73,15 +79,19 @@ idt_set_intr_gate_desc(int intr_num,
uint16_t cs,
uint16_t dpl)
{
intr_gate_desc_t *desc = &idt[intr_num];
intr_gate_desc_t desc;
desc->offset_low = offset & 0xFFFF;
desc->selector = cs;
desc->fixed = BIT(9) | BIT(10);
desc->d = 1;
desc->dpl = dpl;
desc->p = 1;
desc->offset_high = (offset >> 16) & 0xFFFF;
desc.offset_low = offset & 0xFFFF;
desc.selector = cs;
desc.fixed = BIT(9) | BIT(10);
desc.pad = 0;
desc.d = 1;
desc.dpl = dpl;
desc.p = 1;
desc.offset_high = (offset >> 16) & 0xFFFF;
KERN_WRITEL(idt[intr_num].raw_hi, desc.raw_hi);
KERN_WRITEL(idt[intr_num].raw_lo, desc.raw_lo);
}
/*---------------------------------------------------------------------------*/
/* Initialize Interrupt Descriptor Table. The IDT is initialized with
@@ -95,7 +105,7 @@ idt_init(void)
/* Initialize idtr structure */
idtr.limit = (sizeof(intr_gate_desc_t) * NUM_DESC) - 1;
idtr.base = (uint32_t)&idt;
idtr.base = KERN_DATA_OFF_TO_PHYS_ADDR((uint32_t)idt);
/* Load IDTR register */
__asm__("lidt %0\n\t" :: "m" (idtr));

@@ -34,7 +34,7 @@
#include <stdint.h>
#include "prot-domains.h"
void idt_init(void) ATTR_CODE_BOOT;
void idt_init(void);
void idt_set_intr_gate_desc(int intr_num,
uint32_t offset,
uint16_t cs,

@@ -5,13 +5,15 @@ Introduction
------------
The X86 port of Contiki implements a simple, lightweight form of
protection domains using a pluggable framework. Currently, the
following plugin is available:
protection domains using a pluggable framework. Currently, there are
two plugins available:
- Flat memory model with paging.
- Multi-segment memory model with hardware-switched segments based on
Task-State Segment (TSS) structures.
For an introduction to paging and possible ways in which it can be
used, refer to the following resources:
For an introduction to paging and TSS and possible ways in which they
can be used, refer to the following resources:
- Intel Combined Manual (Intel 64 and IA-32 Architectures Software
Developer's Manual), Vol. 3, Chapter 4
@@ -28,7 +30,7 @@ idealized principle is balanced against the practical objectives of
limiting the number of relatively time-consuming context switches and
minimizing changes to existing code. In fact, no changes were made to
code outside of the CPU- and platform-specific code directories for
the initial plugin.
the initial plugins.
Each protection domain can optionally be associated with a metadata
and/or MMIO region. The hardware can support additional regions per
@@ -139,7 +141,11 @@ the one that was interrupted. However, interrupts are only actually
enabled in the application protection domain.
Similarly, register contents may be accessed and modified across
protection domain boundaries.
protection domain boundaries in some protection domain
implementations. The TSS task switching mechanism automatically saves
and restores many registers to and from TSS data structures when
switching tasks, but the paging-based protection domain implementation
does not perform analogous operations.
For the reasons described above, each protection domain should only
invoke other protection domains that it trusts to properly handle data
@@ -186,7 +192,9 @@ disabled. Flat segments each map the whole 4GiB physical memory
space. This is the state of the system when the OS enters boot stage
0. This stage is responsible for setting up a new GDT and loading the
segment registers with the appropriate descriptors from the new GDT to
enable boot stage 1 to run.
enable boot stage 1 to run. Code in stage 1 for multi-segment
protection domain implementations requires that the appropriate
segment-based address translations be configured.
#### Boot Stage 1
@@ -258,17 +266,18 @@ Ring level 1 is unused.
### IO and Interrupt Privileges
The kernel protection domain cooperative scheduling context needs
access to IO ports, for device initialization. Other protection
domains may also require such access. The IO Privilege Level (IOPL)
that is assigned to a protection domain using the relevant bits in the
access to IO ports, for device initialization. Some other protection
domains also require such access. The IO Privilege Level (IOPL) that
is assigned to a protection domain using the relevant bits in the
EFLAGS field could be set according to whether IO port access is
required in that protection domain. However, this would introduce
additional complexity and overhead in the critical system call and
return dispatchers. Instead, the IOPL is always set to block IO
access from the cooperative scheduling context. Port IO instructions
in that context will then generate general protection faults, and the
exception handler decodes and emulates authorized port IO
instructions.
required in that protection domain. This is straightforward for TSS,
which includes separate flags settings for each protection domain.
However, this would introduce additional complexity and overhead in
the critical system call and return dispatchers for other plugins.
Instead, the IOPL is always set to block IO access from the
cooperative scheduling context. Port IO instructions in that context
will then generate general protection faults, and the exception
handler decodes and emulates authorized port IO instructions.
Interrupts are handled at ring level 2, since they do not use any
privileged instructions. They do cause the interrupt flag to be
@@ -307,11 +316,15 @@ pivoting to the main stack and executing the handler.
### Protection Domain Control Structures (PDCSes)
Each protection domain is managed by the kernel and privileged
functions using a PDCS. The PDCS structure is entirely
software-defined. The initial protection domain plugin does not
support re-entrant protection domains to simplify the implementation
of the plugin by enabling domain-specific information (e.g. system
call return address) to be trivially stored in each PDCS.
functions using a PDCS. The structure of the PDCS is partially
hardware-imposed in the cases of the two segment-based plugins, since
the PDCS contains the Local Descriptor Table (LDT) and the TSS, if
applicable. In the paging plugin, the PDCS structure is entirely
software-defined. None of the initial protection domain plugins
support re-entrant protection domains due to hardware-imposed
limitations of TSS and to simplify the implementation of the other
plugins by enabling domain-specific information (e.g. system call
return address) to be trivially stored in each PDCS.
### Paging-Based Protection Domains
@@ -547,6 +560,293 @@ be possible to improve the robustness of the system by marking that
data as read-only. Doing so would introduce additional complexity
into the system.
### Hardware-Switched Segment-Based Protection Domains
Primary implementation sources:
- cpu/x86/mm/tss-prot-domains.c
- cpu/x86/mm/tss-prot-domains-asm.S
#### Introduction
One TSS is allocated for each protection domain. Each one is
associated with its own dedicated LDT. The memory resources assigned
to each protection domain are represented as segment descriptors in
the LDT for the protection domain. Additional shared memory resources
are represented as segment descriptors in the GDT.
#### System Call and Return Dispatching
The system call dispatcher runs in the context of the server
protection domain. It is a common piece of code that is shared among
all protection domains. Thus, each TSS, except the application TSS,
has its EIP field initialized to the entrypoint for the system call
dispatcher, so it will be the first code to run when the first switch
to that task is performed.
The overall process of handling a system call can be illustrated at a
high level as follows. Some minor steps are omitted from this
illustration in the interest of clarity and brevity.
```
== BEGIN Client protection domain ==========================================
-- BEGIN Caller ------------------------------------------------------------
1. Call system call stub.
--
13. Continue execution...
-- END Caller --------------------------------------------------------------
-- BEGIN System call stub --------------------------------------------------
2. Already in desired (server) protection domain?
- No: Request task switch to server protection domain.
- Yes: Jump to system call body.
--
12. Return to caller.
-- END System call stub ----------------------------------------------------
== END Client protection domain ============================================
== BEGIN Server protection domain ==========================================
-- BEGIN System call dispatcher ---------------------------------------------
3. Check that the requested system call is allowed. Get entrypoint.
4. Switch to the main stack.
5. Pop the client return address off the stack to a callee-saved register.
6. Push the address of the system call return dispatcher onto the stack.
7. Jump to system call body.
--
10. Restore the client return address to the stack.
11. Request task switch to client protection domain.
-- END System call dispatcher ----------------------------------------------
-- BEGIN System call body --------------------------------------------------
8. Execute the work for the requested system call.
9. Return (to system call return stub, unless invoked from server
protection domain, in which case return is to caller).
-- END System call body ----------------------------------------------------
== END Server protection domain ============================================
```
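A hedged sketch of the client-side stub logic (steps 2 and 12 above);
prot_domains_get_current, server_dom_id, server_tss_ptr, and
example_syscall_body are illustrative names rather than identifiers
from this patch, while far_pointer_t is the type added to gdt.h:

```c
#include "gdt.h"                  /* far_pointer_t, added by this patch */

extern far_pointer_t server_tss_ptr; /* selector names the server TSS  */
extern dom_id_t server_dom_id;       /* illustrative                    */
dom_id_t prot_domains_get_current(void); /* illustrative helper         */
void example_syscall_body(void);

void
example_syscall_stub(void)
{
  if(prot_domains_get_current() == server_dom_id) {
    /* Already executing in the server domain: a near call suffices. */
    example_syscall_body();
  } else {
    /* Far jump through the server's TSS descriptor to request a
     * hardware task switch; the offset field is ignored.
     */
    __asm__ __volatile__ ("ljmp *%0" :: "m" (server_tss_ptr));
  }
}
```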
An additional exception handler is needed for the "Device Not
Available" exception. The handler comprises just a CLTS and an IRET
instruction. The CLTS instruction is privileged, which is why it must
be run at ring level 0. This exception handler is invoked when a
floating point instruction is used following a task switch, and its
sole purpose is to enable the floating point instruction to execute
after the exception handler returns. See the TSS resources listed
above for more details regarding interactions between task switching
and floating point instructions.
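A sketch of that handler in C (the patch implements it in assembly;
this assumes a compiler that honors the naked attribute on x86, e.g.
Clang):

```c
void __attribute__((naked))
dev_not_avail_isr(void)
{
  __asm__ __volatile__ (
    "clts\n\t"    /* clear CR0.TS, which the CPU sets on a task switch */
    "iret\n\t");  /* return and retry the faulting FPU instruction */
}
```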
Each segment register may represent a different data region within
each protection domain, although the FS register is used for two
separate purposes at different times. The segments are defined as
follows:
- CS (code segment) maps all non-startup code with execute-only
permissions in all protection domains. Limiting the code that is
executable within each protection domain to just the code that is
actually needed within that protection domain could improve the
robustness of the system, but it is challenging to determine all
code that may be needed in a given protection domain (e.g. all
needed library routines). Furthermore, that code may not all be
contiguous, and each segment descriptor can only map a contiguous
memory region. Finally, segment-based memory addressing is
relative to an offset of zero from the beginning of each segment,
introducing additional complexity if such fine-grained memory
management were to be used.
- DS (default data segment) typically maps the main stack and all
non-stack data memory that is accessible from all protection
domains. Limiting the data that is accessible via DS within each
protection domain to just the subset of the data that is actually
needed within that protection domain could improve the robustness
of the system, but it is challenging for similar reasons to those
that apply to CS. Access to the main stack via DS is supported so
that code that copies the stack pointer to a register and attempts
to access stack entries via DS works correctly. Disallowing access
to the main stack via DS could improve the robustness of the
system, but that may require modifying code that expects to be able
to access the stack via DS.
- ES is loaded with the same segment descriptor as DS so that string
operations (e.g. the MOVS instruction) work correctly.
- FS usually maps the kernel-owned data region. That region can only
be written via FS in the kernel protection domain. FS contains a
descriptor specifying a read-only mapping in all other protection
domains except the application protection domain, in which FS is
nullified. Requiring that code specifically request access to the
kernel-owned data region by using the FS segment may improve the
robustness of the system by blocking undesired accesses to the
kernel-owned data region via memory access instructions within the
kernel protection domain that implicitly access DS. The reason for
granting read-only access to the kernel-owned data region from most
protection domains is that the system call dispatcher runs in the
context of the server protection domain to minimize overhead, and
it requires access to the kernel-owned data region. It may improve
the robustness of the system to avoid this by running the system
call dispatcher in a more-privileged ring level (e.g. ring 1)
within the protection domain and just granting access to the
kernel-owned data region from that ring. However, that would
necessitate a ring level transition to ring 3 when dispatching the
system call, which would increase overhead. The application
protection domain does not export any system calls, so it does not
require access to the kernel-owned data region.
- FS is temporarily loaded with a segment descriptor that maps just
an MMIO region used by a driver protection domain when such a
driver needs to perform MMIO accesses.
- GS maps an optional region of readable and writable metadata that
can be associated with a protection domain. In protection domains
that are not associated with metadata, GS is nullified.
- SS usually maps just the main stack. This may improve the
robustness of the system by enabling immediate detection of stack
underflows and overflows rather than allowing such a condition to
result in silent data corruption. Interrupt handlers use a stack
segment that covers the main stack and also includes a region above
the main stack that is specifically for use by interrupt handlers.
In like manner, exception handlers use a stack segment that covers
both of the other stacks and includes an additional region. This
is to support the interrupt dispatchers that copy parameters from
the interrupt-specific stack region to the main stack prior to
pivoting to the main stack to execute an interrupt handler body.
The approximate memory layout of the system is depicted below,
starting with the highest physical addresses and proceeding to lower
physical addresses. The memory ranges that are mapped at various
times by each of the segment registers are also depicted. Read the
descriptions of each segment above for more information about what
memory range may be mapped by each segment register at various times
with various protection domain configurations. Parenthetical notes
indicate the protection domains that can use each mapping. The suffix
[L] indicates that the descriptor is loaded from LDT. Optional
mappings are denoted by a '?' after the protection domain label. The
'other' protection domain label refers to protection domains other
than the application and kernel domains.
```
...
+------------------------------------------+ \
| Domain X MMIO | +- FS[L]
+------------------------------------------+ / (other?)
...
+------------------------------------------+ \
| Domain X DMA-accessible metadata | +- GS[L] (other?)
| (section .dma_bss) | |
+------------------------------------------+ /
+------------------------------------------+ \
| Domain X metadata (section .meta_bss) | +- GS[L] (other?)
+------------------------------------------+ /
...
+------------------------------------------+ \
| Kernel-private data | |
| (sections .prot_dom_bss, .gdt_bss, etc.) | +- FS[L] (kern)
+------------------------------------------+ |
+------------------------------------------+ \
| System call data (section .syscall_bss) | |
+------------------------------------------+ +- FS[L] (all)
+------------------------------------------+ |
| Kernel-owned data (section .kern_bss) | |
+------------------------------------------+ /
+------------------------------------------+ \
| Common data | |
| (sections .data, .rodata*, .bss, etc.) | |
+------------------------------------------+ +- DS, ES
+------------------------------------------+ \ | (all)
| Exception stack (section .exc_stack) | | |
|+----------------------------------------+| \ |
|| Interrupt stack (section .int_stack) || | |
||+--------------------------------------+|| \ |
||| Main stack (section .main_stack) ||| +- SS (all) |
+++--------------------------------------+++ / /
+------------------------------------------+ \
| Main code (.text) | +- CS (all)
+------------------------------------------+ /
+------------------------------------------+
| Bootstrap code (section .boot_text) |
+------------------------------------------+
+------------------------------------------+
| Multiboot header |
+------------------------------------------+
...
```
This memory layout is more efficient than the layout that is possible
with paging-based protection domains, since segments have byte
granularity, whereas the minimum unit of control supported by paging
is a 4KiB page. For example, under paging, metadata may need to be
padded out to a multiple of the page size. Such padding may also permit
potentially-undesirable accesses to the padded areas of code and data
regions that do not entirely fill the pages that they occupy.
Kernel data structure access, including to the descriptor tables
themselves, is normally restricted to the code running at ring level
0, specifically the exception handlers and the system call and return
dispatchers. It is also accessible from the cooperative scheduling
context in the kernel protection domain. Interrupt delivery is
disabled in the kernel protection domain, so the preemptive scheduling
context is not used.
SS, DS, and ES all have the same base address, since the compiler may
assume that a flat memory model is in use. Memory accesses that use a
base register of SP/ESP or BP/EBP or that are generated by certain
other instructions (e.g. PUSH, RET, etc.) are directed to SS by
default, whereas other accesses are directed to DS or ES by default.
The compiler may use an instruction that directs an access to DS or ES
even if the data being accessed is on the stack, which is why these
three segments must use the same base address. However, it is
possible to use a lower limit for SS than for DS and ES for the
following reasons. Compilers commonly provide an option for
preventing the frame pointer, EBP, from being omitted and possibly
used to point to non-stack data. In our tests, compilers never used
ESP to point to non-stack data.
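A small example of why the bases must match:

```c
int
sum3(void)
{
  int arr[3] = { 1, 2, 3 };
  int *p = arr;          /* the compiler may hold p in, say, %esi */
  /* Dereferences of p default to the DS segment (movl (%esi), %eax)
   * even though arr is on the stack, so DS must map the stack at the
   * same base as SS for the linear addresses to coincide.
   */
  return p[0] + p[1] + p[2];
}
```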
Each task switch ends up saving and restoring more state than is
actually useful to us, but the implementation attempts to minimize
overhead by configuring the register values in each TSS to reduce the
number of register loads that are needed in the system call
dispatcher. Specifically, two callee-saved registers are populated
with base addresses used when computing addresses in the entrypoint
information table as well as a mask corresponding to the ID of the
server protection domain that is used to check whether the requested
system call is exported by the server protection domain. Callee-saved
registers are used, since the task return will update the saved
register values.
Note that this implies that the intervening code run between the task
call and return can modify critical data used by the system call
dispatcher. However, this is analogous to the considerations
associated with sharing a single stack amongst all protection domains
and should be addressed similarly, by only invoking protection domains
that are trusted by the caller to not modify the saved critical
values. This consideration is specific to the TSS-based dispatcher
and is not shared by the ring 0 dispatcher used in the other
plugins.
Data in the .rodata sections is marked read/write, even though it may
be possible to improve the robustness of the system by marking that
data as read-only. Doing so would introduce even more complexity into
the system than would be the case with paging-based protection
domains, since it would require allocating different segment
descriptors for the read-only vs. the read/write data.
#### Supporting Null-Pointer Checks
A lot of code considers a pointer value of 0 to be invalid. However,
segment offsets always start at 0. To accommodate the common software
behavior, at least the first byte of each segment is marked as
unusable. An exception to this is that the first byte of the stack
segments is usable.
#### Interrupt and Exception Dispatching
A distinctive challenge that occurs during interrupt and exception
dispatching is that the state of the segment registers when an
interrupt or exception occurs is somewhat unpredictable. For example,
an exception may occur while MMIO is being performed, meaning that FS
is loaded with the MMIO descriptor instead of the kernel descriptor.
Leaving the segment registers configured in that way could cause
incorrect interrupt or exception handler behavior. Thus, the
interrupt or exception dispatcher must save the current segment
configuration, switch to a configuration that is suitable for the
handler body, and then restore the saved segment configuration after
the handler body returns. Another motivation for this is that the
interrupted code may have corrupted the segment register configuration
in an unexpected manner, since segment register load instructions are
unprivileged. Similar segment register updates must be performed for
similar reasons when dispatching system calls.
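A hedged sketch of that save/switch/restore pattern (the patch's
dispatchers implement it in assembly; handler_body is a placeholder):

```c
#include <stdint.h>
#include "ldt-layout.h"        /* LDT_SEL_KERN, added by this patch */

void handler_body(void);       /* placeholder for the real handler */

void
example_int_dispatch(void)
{
  uint16_t saved_fs;

  /* Save whatever FS configuration the interrupted code was using. */
  __asm__ __volatile__ ("mov %%fs, %0" : "=rm" (saved_fs));
  /* Install the kernel data descriptor expected by the handler. */
  __asm__ __volatile__ ("mov %0, %%fs" :: "r" ((uint16_t)LDT_SEL_KERN));
  handler_body();
  /* Restore the saved segment configuration before resuming. */
  __asm__ __volatile__ ("mov %0, %%fs" :: "rm" (saved_fs));
}
```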
### Pointer Validation
Primary implementation sources:
@@ -563,10 +863,14 @@ an unintended manner. For example, if an incoming pointer referenced
the return address, it could potentially redirect execution with the
privileges of the callee protection domain.
It is also necessary to check that the pointer is either within the
stack region or the shared data region (or a guard band region, since
that will generate a fault) to prevent redirection of data accesses to
MMIO or metadata regions.
When the paging-based plugin is in use, it is also necessary to check
that the pointer is either within the stack region or the shared data
region (or a guard band region, since that will generate a fault) to
prevent redirection of data accesses to MMIO or metadata regions. The
other plugins already configure segments to restrict accesses to DS to
just those regions. Pointers provided as inputs to system calls as
defined above should never be dereferenced in any segment other than
DS.
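The validation pattern as it appears in the Ethernet driver hunk
above (quarkX1000_eth_poll):

```c
uint16_t *loc_frame_len;

/* Validate the caller-supplied pointer and obtain a checked local
 * alias before the first dereference:
 */
PROT_DOMAINS_VALIDATE_PTR(loc_frame_len, frame_len, sizeof(*frame_len));
/* ... */
*loc_frame_len = frm_len;  /* safe: the target range was validated */
```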
The pointer is both validated and copied to a new storage location,
which must be within the callee's local stack region (excluding the
@@ -648,8 +952,11 @@ The following steps are required:
Usage
-----
To enable protection domain support, add
"X86_CONF_PROT_DOMAINS=paging" to the command line.
To enable protection domain support, add "X86_CONF_PROT_DOMAINS=" to
the command line and specify one of the following options:
- paging
- tss
The paging option accepts a sub-option to determine whether the TLB is
fully- or selectively-invalidated during protection domain switches.
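For example, to build for the Galileo platform (the exact spelling of
the build invocation is an assumption; the X86_CONF_USE_INVLPG
sub-option name is taken from the Makefile hunk near the top of this
commit):

```
make TARGET=galileo X86_CONF_PROT_DOMAINS=tss
make TARGET=galileo X86_CONF_PROT_DOMAINS=paging X86_CONF_USE_INVLPG=1
```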

@@ -39,6 +39,8 @@
* outside of gdt.c.
*/
#define GDT_NUM_FIXED_DESC 7
#elif X86_CONF_PROT_DOMAINS_MULTI_SEG
#define GDT_NUM_FIXED_DESC 11
#else
#define GDT_NUM_FIXED_DESC 3
#endif
@@ -66,12 +68,34 @@
/** Stack segment for interrupt handlers */
#define GDT_IDX_STK_INT 5
#if X86_CONF_PROT_DOMAINS == X86_CONF_PROT_DOMAINS__PAGING
#define GDT_IDX_CODE_EXC GDT_IDX_CODE_FLAT
/** Default data segment used by code at all privilege levels */
#define GDT_IDX_DATA 6
#define GDT_IDX_STK GDT_IDX_DATA
#define GDT_IDX_STK_EXC GDT_IDX_DATA_FLAT
#else
/**
* Same bounds and permissions as default code segment, but at the exception
* handler privilege level
*/
#define GDT_IDX_CODE_EXC 6
/** R/W kernel data descriptor used during boot stage 1 */
#define GDT_IDX_DATA_KERN_EXC 7
/** Default data segment used by code at all privilege levels */
#define GDT_IDX_DATA 8
/**
* Default stack segment, which overlaps with the beginning of the default data
* segment
*/
#define GDT_IDX_STK 9
/** Stack segment for exception handlers */
#define GDT_IDX_STK_EXC 10
#define GDT_IDX_TSS(dom_id) (GDT_NUM_FIXED_DESC + (2 * (dom_id)))
#define GDT_IDX_LDT(dom_id) (GDT_NUM_FIXED_DESC + (2 * (dom_id)) + 1)
#endif
#else
#define GDT_IDX_CODE GDT_IDX_CODE_FLAT
#define GDT_IDX_CODE_INT GDT_IDX_CODE_FLAT
#define GDT_IDX_CODE_EXC GDT_IDX_CODE_FLAT
@@ -96,10 +120,14 @@
#define GDT_SEL_CODE_EXC GDT_SEL(GDT_IDX_CODE_EXC, PRIV_LVL_EXC)
#define GDT_SEL_DATA GDT_SEL(GDT_IDX_DATA, PRIV_LVL_EXC)
#define GDT_SEL_DATA_KERN_EXC GDT_SEL(GDT_IDX_DATA_KERN_EXC, PRIV_LVL_EXC)
#define GDT_SEL_STK GDT_SEL(GDT_IDX_STK, PRIV_LVL_USER)
#define GDT_SEL_STK_INT GDT_SEL(GDT_IDX_STK_INT, PRIV_LVL_INT)
#define GDT_SEL_STK_EXC GDT_SEL(GDT_IDX_STK_EXC, PRIV_LVL_EXC)
#define GDT_SEL_TSS(dom_id) GDT_SEL(GDT_IDX_TSS(dom_id), PRIV_LVL_USER)
#define GDT_SEL_LDT(dom_id) GDT_SEL(GDT_IDX_LDT(dom_id), PRIV_LVL_USER)
#endif /* CPU_X86_MM_GDT_LAYOUT_H_ */

cpu/x86/mm/ldt-layout.h (new file, 59 lines)
@@ -0,0 +1,59 @@
/*
* Copyright (C) 2015, Intel Corporation. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived
* from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
* COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
* STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
* OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef CPU_X86_MM_LDT_LAYOUT_H_
#define CPU_X86_MM_LDT_LAYOUT_H_
#include "gdt-layout.h"
/* Each LDT can contain up to this many descriptors, but some protection
* domains may not use all of the slots.
*/
#define LDT_NUM_DESC 3
/**
* Provides access to kernel data. Most protection domains are granted at most
* read-only access, but the kernel protection domain is granted read/write
* access.
*/
#define LDT_IDX_KERN 0
/** Maps a device MMIO range */
#define LDT_IDX_MMIO 1
/** Maps domain-defined metadata */
#define LDT_IDX_META 2
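/* ORing in bit 2 below sets the selector's Table Indicator (TI) flag,
 * which directs the CPU to look up the descriptor in the LDT rather
 * than the GDT.
 */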
#define LDT_SEL(idx, rpl) (GDT_SEL(idx, rpl) | (1 << 2))
#define LDT_SEL_KERN LDT_SEL(LDT_IDX_KERN, PRIV_LVL_USER)
#define LDT_SEL_MMIO LDT_SEL(LDT_IDX_MMIO, PRIV_LVL_USER)
#define LDT_SEL_META LDT_SEL(LDT_IDX_META, PRIV_LVL_USER)
#define LDT_SEL_STK LDT_SEL(LDT_IDX_STK, PRIV_LVL_USER)
#endif /* CPU_X86_MM_LDT_LAYOUT_H_ */

cpu/x86/mm/multi-segment.c (new file, 239 lines)
@@ -0,0 +1,239 @@
/*
* Copyright (C) 2015, Intel Corporation. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived
* from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
* COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
* STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
* OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "gdt.h"
#include "helpers.h"
#include "prot-domains.h"
#include "segmentation.h"
#include "stacks.h"
/*---------------------------------------------------------------------------*/
static uint32_t
segment_desc_compute_base(segment_desc_t desc)
{
return (desc.base_hi << 24) | (desc.base_mid << 16) | desc.base_lo;
}
/*---------------------------------------------------------------------------*/
void
prot_domains_reg_multi_seg(volatile struct dom_kern_data ATTR_KERN_ADDR_SPACE *dkd,
uintptr_t mmio, size_t mmio_sz,
uintptr_t meta, size_t meta_sz)
{
segment_desc_t desc;
dom_id_t dom_id = PROT_DOMAINS_GET_DOM_ID(dkd);
uint32_t kern_data_len;
uint32_t tmp;
if((dkd < prot_domains_kern_data) ||
(prot_domains_kern_data_end <= dkd) ||
(((((uintptr_t)dkd) - (uintptr_t)prot_domains_kern_data) %
sizeof(dom_kern_data_t)) != 0)) {
halt();
}
KERN_READL(tmp, dkd->ldt[DT_SEL_GET_IDX(LDT_SEL_KERN)].raw_hi);
if(tmp != 0) {
/* This PDCS was previously initialized, which is disallowed. */
halt();
}
/* Initialize descriptors */
if(dom_id == DOM_ID_kern) {
kern_data_len = (uint32_t)&_ebss_kern_addr;
} else {
/* Non-kernel protection domains do not need to access the protection
* domain control structures, and they may contain saved register values
* that are private to each domain.
*/
kern_data_len = (uint32_t)&_ebss_syscall_addr;
}
kern_data_len -= (uint32_t)&_sbss_kern_addr;
segment_desc_init(&desc, (uint32_t)&_sbss_kern_addr, kern_data_len,
/* Every protection domain requires at least read-only access to kernel
data to read dom_client_data structures and to support the system call
dispatcher, if applicable. Only the kernel protection domain is granted
read/write access to the kernel data. */
((dom_id == DOM_ID_kern) ?
SEG_TYPE_DATA_RDWR :
SEG_TYPE_DATA_RDONLY) |
SEG_FLAG(DPL, PRIV_LVL_USER) |
SEG_GRAN_BYTE | SEG_DESCTYPE_NSYS);
KERN_WRITEL(dkd->ldt[LDT_IDX_KERN].raw_lo, desc.raw_lo);
KERN_WRITEL(dkd->ldt[LDT_IDX_KERN].raw_hi, desc.raw_hi);
if(mmio_sz != 0) {
if(SEG_MAX_BYTE_GRAN_LEN < mmio_sz) {
halt();
}
segment_desc_init(&desc, mmio, mmio_sz,
SEG_FLAG(DPL, PRIV_LVL_USER) | SEG_GRAN_BYTE |
SEG_DESCTYPE_NSYS | SEG_TYPE_DATA_RDWR);
} else {
desc.raw = SEG_DESC_NOT_PRESENT;
}
KERN_WRITEL(dkd->ldt[LDT_IDX_MMIO].raw_lo, desc.raw_lo);
KERN_WRITEL(dkd->ldt[LDT_IDX_MMIO].raw_hi, desc.raw_hi);
if(meta_sz != 0) {
if(SEG_MAX_BYTE_GRAN_LEN < meta_sz) {
halt();
}
segment_desc_init(&desc, meta, meta_sz,
SEG_FLAG(DPL, PRIV_LVL_USER) | SEG_GRAN_BYTE |
SEG_DESCTYPE_NSYS | SEG_TYPE_DATA_RDWR);
} else {
desc.raw = SEG_DESC_NOT_PRESENT;
}
KERN_WRITEL(dkd->ldt[LDT_IDX_META].raw_lo, desc.raw_lo);
KERN_WRITEL(dkd->ldt[LDT_IDX_META].raw_hi, desc.raw_hi);
segment_desc_init(&desc,
KERN_DATA_OFF_TO_PHYS_ADDR(dkd->ldt),
sizeof(dkd->ldt),
SEG_FLAG(DPL, PRIV_LVL_USER) | SEG_GRAN_BYTE |
SEG_DESCTYPE_SYS | SEG_TYPE_LDT);
gdt_insert(GDT_IDX_LDT(dom_id), desc);
}
/*---------------------------------------------------------------------------*/
void
prot_domains_gdt_init()
{
int i;
segment_desc_t desc;
segment_desc_init(&desc,
(uint32_t)&_stext_addr,
((uint32_t)&_etext_addr) - (uint32_t)&_stext_addr,
SEG_FLAG(DPL, PRIV_LVL_EXC) | SEG_GRAN_BYTE |
SEG_DESCTYPE_NSYS | SEG_TYPE_CODE_EX);
gdt_insert_boot(GDT_IDX_CODE_EXC, desc);
segment_desc_init(&desc,
(uint32_t)&_sdata_addr,
((uint32_t)&_edata_addr) - (uint32_t)&_sdata_addr,
SEG_FLAG(DPL, PRIV_LVL_USER) | SEG_GRAN_BYTE |
SEG_DESCTYPE_NSYS | SEG_TYPE_DATA_RDWR);
gdt_insert_boot(GDT_IDX_DATA, desc);
segment_desc_init(&desc,
(uint32_t)&_sbss_kern_addr,
((uint32_t)&_ebss_kern_addr) -
(uint32_t)&_sbss_kern_addr,
SEG_FLAG(DPL, PRIV_LVL_EXC) | SEG_GRAN_BYTE |
SEG_DESCTYPE_NSYS | SEG_TYPE_DATA_RDWR);
gdt_insert_boot(GDT_IDX_DATA_KERN_EXC, desc);
segment_desc_init(&desc,
(uint32_t)DATA_OFF_TO_PHYS_ADDR(stacks_main),
STACKS_SIZE_MAIN,
SEG_FLAG(DPL, PRIV_LVL_USER) | SEG_GRAN_BYTE |
SEG_DESCTYPE_NSYS | SEG_TYPE_DATA_RDWR);
gdt_insert_boot(GDT_IDX_STK, desc);
segment_desc_set_limit(&desc, STACKS_SIZE_MAIN + STACKS_SIZE_INT);
SEG_SET_FLAG(desc, DPL, PRIV_LVL_INT);
gdt_insert_boot(GDT_IDX_STK_INT, desc);
segment_desc_set_limit(&desc,
STACKS_SIZE_MAIN +
STACKS_SIZE_INT +
STACKS_SIZE_EXC);
SEG_SET_FLAG(desc, DPL, PRIV_LVL_EXC);
gdt_insert_boot(GDT_IDX_STK_EXC, desc);
/* Not all domains will necessarily be initialized, so this initially marks
* all per-domain descriptors not-present.
*/
desc.raw = SEG_DESC_NOT_PRESENT;
for(i = 0; i < PROT_DOMAINS_ACTUAL_CNT; i++) {
gdt_insert_boot(GDT_IDX_TSS(i), desc);
gdt_insert_boot(GDT_IDX_LDT(i), desc);
}
__asm__ __volatile__ (
"mov %[_default_data_], %%ds\n\t"
"mov %[_default_data_], %%es\n\t"
"mov %[_kern_data_], %%" SEG_KERN "s\n\t"
:
: [_default_data_] "r"(GDT_SEL_DATA),
[_kern_data_] "r"(GDT_SEL_DATA_KERN_EXC));
}
/*---------------------------------------------------------------------------*/
void
multi_segment_launch_kernel(void)
{
/* Update segment registers. */
__asm__ __volatile__ (
"mov %[_data_seg_], %%ds\n\t"
"mov %[_data_seg_], %%es\n\t"
"mov %[_kern_seg_], %%" SEG_KERN "s\n\t"
"mov %[_data_seg_], %%" SEG_META "s\n\t"
:
: [_data_seg_] "r" (GDT_SEL_DATA),
[_kern_seg_] "r" (LDT_SEL_KERN)
);
}
/*---------------------------------------------------------------------------*/
void
prot_domains_enable_mmio(void)
{
__asm__ __volatile__ ("mov %0, %%" SEG_MMIO "s" :: "r" (LDT_SEL_MMIO));
}
/*---------------------------------------------------------------------------*/
void
prot_domains_disable_mmio(void)
{
__asm__ __volatile__ ("mov %0, %%" SEG_KERN "s" :: "r" (LDT_SEL_KERN));
}
/*---------------------------------------------------------------------------*/
uintptr_t
prot_domains_lookup_meta_phys_base(dom_client_data_t ATTR_KERN_ADDR_SPACE *drv)
{
dom_id_t dom_id;
segment_desc_t desc;
volatile dom_kern_data_t ATTR_KERN_ADDR_SPACE *dkd;
KERN_READL(dom_id, drv->dom_id);
dkd = prot_domains_kern_data + dom_id;
KERN_READL(desc.raw_lo, dkd->ldt[DT_SEL_GET_IDX(LDT_SEL_META)].raw_lo);
KERN_READL(desc.raw_hi, dkd->ldt[DT_SEL_GET_IDX(LDT_SEL_META)].raw_hi);
return segment_desc_compute_base(desc);
}
/*---------------------------------------------------------------------------*/

cpu/x86/mm/multi-segment.h Normal file

@ -0,0 +1,195 @@
/*
* Copyright (C) 2015, Intel Corporation. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived
* from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
* COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
* STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
* OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef CPU_X86_MM_MULTI_SEGMENT_H_
#define CPU_X86_MM_MULTI_SEGMENT_H_
#include <stdint.h>
#include <stdlib.h>
#include "helpers.h"
#include "ldt-layout.h"
#ifdef __clang__
#define __SEG_FS
#define __seg_fs __attribute__((address_space(257)))
#define __SEG_GS
#define __seg_gs __attribute__((address_space(256)))
#endif
#ifdef __SEG_FS
#define ATTR_MMIO_ADDR_SPACE __seg_fs
#define ATTR_KERN_ADDR_SPACE __seg_fs
#else
#define ATTR_KERN_ADDR_SPACE
#endif
#ifdef __SEG_GS
#define ATTR_META_ADDR_SPACE __seg_gs
#endif
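/* Illustrative sketch (not part of this header): when the compiler provides
 * the __seg_fs address space, a plain C access to a variable declared with
 * ATTR_KERN_ADDR_SPACE compiles to an FS-prefixed instruction, and pointer
 * conversions that silently drop the qualifier are diagnosed, e.g.:
 *
 *   extern int ATTR_KERN_ADDR_SPACE kern_counter;
 *   int local = kern_counter;  // emitted roughly as movl %fs:kern_counter
 *   int *p = &kern_counter;    // rejected under __SEG_FS: drops __seg_fs
 *
 * Without compiler support, the same accesses must instead go through the
 * KERN_READ/KERN_WRITE macros defined below.
 */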
void prot_domains_reg_multi_seg(volatile struct dom_kern_data ATTR_KERN_ADDR_SPACE *dkd,
uintptr_t mmio, size_t mmio_sz,
uintptr_t meta, size_t meta_sz);
void multi_segment_launch_kernel(void);
#define MULTI_SEGMENT_ENTER_ISR(exc) \
"mov $" EXP_STRINGIFY(GDT_SEL_DATA) ", %%eax\n\t" \
/* Refresh DS and ES in case the userspace code corrupted them. */ \
"mov %%eax, %%ds\n\t" \
"mov %%eax, %%es\n\t" \
/* Refresh SEG_KERN. */ \
"mov $" EXP_STRINGIFY(LDT_SEL_KERN) ", %%eax\n\t" \
"mov %%eax, %%" SEG_KERN "s\n\t" \
".if " #exc "\n\t" \
/* It is possible that a routine performing MMIO is being interrupted. */ \
/* Thus, it is necessary to save and restore the MMIO segment register */ \
/* (in a callee-saved register). */ \
"mov %%" SEG_MMIO "s, %%ebp\n\t" \
"mov $" EXP_STRINGIFY(GDT_SEL_DATA_KERN_EXC) ", %%eax\n\t" \
"mov %%eax, %%" SEG_KERN "s\n\t" \
".endif\n\t"
#define MULTI_SEGMENT_LEAVE_ISR(exc) \
".if " #exc "\n\t" \
"mov %%ebp, %%" SEG_MMIO "s\n\t" \
".endif\n\t"
/**
* The MMIO region is tightly bounded within a segment, so its base offset is
* always 0.
*/
#define PROT_DOMAINS_MMIO(dcd) 0
/**
* The metadata region is tightly bounded within a segment, so its base offset
* is always 0.
*/
#define PROT_DOMAINS_META(dcd) 0
#define SEG_MMIO "f" /**< For MMIO accesses, when enabled. */
#define SEG_KERN "f" /**< For kernel data accesses */
#define SEG_META "g" /**< For metadata accesses */
#define _SEG_READL(seg, dst, src) \
__asm__ __volatile__ ( \
"movl %%" seg "s:%[src_], %[dst_]" : [dst_]"=r"(dst) : [src_]"m"(src))
#define _SEG_READW(seg, dst, src) \
__asm__ __volatile__ ( \
"movw %%" seg "s:%[src_], %[dst_]" : [dst_]"=r"(dst) : [src_]"m"(src))
#define _SEG_READB(seg, dst, src) \
__asm__ __volatile__ ( \
"movb %%" seg "s:%[src_], %[dst_]" : [dst_]"=q"(dst) : [src_]"m"(src))
#define _SEG_WRITEL(seg, dst, src) \
__asm__ __volatile__ ( \
"movl %[src_], %%" seg "s:%[dst_]" \
: [dst_]"=m"(dst) : [src_]"r"((uint32_t)(src)))
#define _SEG_WRITEW(seg, dst, src) \
__asm__ __volatile__ ( \
"movw %[src_], %%" seg "s:%[dst_]" \
: [dst_]"=m"(dst) : [src_]"r"((uint16_t)(src)))
#define _SEG_WRITEB(seg, dst, src) \
__asm__ __volatile__ ( \
"movb %[src_], %%" seg "s:%[dst_]" \
: [dst_]"=m"(dst) : [src_]"q"((uint8_t)(src)))
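/* For instance, with SEG_KERN defined as "f" below, KERN_READL(dst, src)
 * expands roughly to
 *   __asm__ __volatile__ ("movl %fs:src, dst" ...);
 * i.e. a single mov carrying an FS segment-override prefix. The "m"
 * constraint lets the compiler address src directly within the overridden
 * segment.
 */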
#ifndef __SEG_FS
#define MMIO_READL(dst, src) _SEG_READL(SEG_MMIO, dst, src)
#define MMIO_READW(dst, src) _SEG_READW(SEG_MMIO, dst, src)
#define MMIO_READB(dst, src) _SEG_READB(SEG_MMIO, dst, src)
#define MMIO_WRITEL(dst, src) _SEG_WRITEL(SEG_MMIO, dst, src)
#define MMIO_WRITEW(dst, src) _SEG_WRITEW(SEG_MMIO, dst, src)
#define MMIO_WRITEB(dst, src) _SEG_WRITEB(SEG_MMIO, dst, src)
#define KERN_READL(dst, src) _SEG_READL(SEG_KERN, dst, src)
#define KERN_READW(dst, src) _SEG_READW(SEG_KERN, dst, src)
#define KERN_READB(dst, src) _SEG_READB(SEG_KERN, dst, src)
#define KERN_WRITEL(dst, src) _SEG_WRITEL(SEG_KERN, dst, src)
#define KERN_WRITEW(dst, src) _SEG_WRITEW(SEG_KERN, dst, src)
#define KERN_WRITEB(dst, src) _SEG_WRITEB(SEG_KERN, dst, src)
#endif
#ifndef __SEG_GS
#define META_READL(dst, src) _SEG_READL(SEG_META, dst, src)
#define META_READW(dst, src) _SEG_READW(SEG_META, dst, src)
#define META_READB(dst, src) _SEG_READB(SEG_META, dst, src)
#define META_WRITEL(dst, src) _SEG_WRITEL(SEG_META, dst, src)
#define META_WRITEW(dst, src) _SEG_WRITEW(SEG_META, dst, src)
#define META_WRITEB(dst, src) _SEG_WRITEB(SEG_META, dst, src)
#endif
#define MEMCPY_FROM_META(dst, src, sz) \
{ \
uintptr_t __dst = (uintptr_t)(dst); \
uintptr_t __src = (uintptr_t)(src); \
size_t __sz = (size_t)(sz); \
__asm__ __volatile__ ( \
"rep movsb %%" SEG_META "s:(%%esi), %%es:(%%edi)\n\t" \
: "+D"(__dst), "+S"(__src), "+c"(__sz)); \
}
#define MEMCPY_TO_META(dst, src, sz) \
{ \
uintptr_t __dst = (uintptr_t)(dst); \
uintptr_t __src = (uintptr_t)(src); \
size_t __sz = (size_t)(sz); \
__asm__ __volatile__ ( \
"push %%es\n\t" \
"push %%" SEG_META "s\n\t" \
"pop %%es\n\t" \
"rep movsb\n\t" \
"pop %%es\n\t" \
: "+D"(__dst), "+S"(__src), "+c"(__sz)); \
}
/** Compute physical address from offset into kernel data space */
#define KERN_DATA_OFF_TO_PHYS_ADDR(x) \
(((uintptr_t)&_sbss_kern_addr) + (uintptr_t)(x))
/** Compute physical address from offset into default data space */
#define DATA_OFF_TO_PHYS_ADDR(x) \
(((uintptr_t)&_sdata_addr) + (uintptr_t)(x))
/** Compute kernel data offset from physical address in kernel data space */
#define PHYS_ADDR_TO_KERN_DATA_OFF(x) \
(((uintptr_t)(x)) - (uintptr_t)&_sbss_kern_addr)
/**
* In multi-segment protection domain implementations, it is sufficient to just
* compare incoming pointers against the frame pointer. All incoming pointers
* are dereferenced in the main data segment, which only maps the stacks and
* the shared data section. Since the shared data section is at a higher
* address range than the stacks, the frame pointer check is sufficient.
*/
#define PROT_DOMAINS_CHECK_INCOMING_PTR PROT_DOMAINS_CHECK_INCOMING_PTR_EBP
void prot_domains_enable_mmio(void);
void prot_domains_disable_mmio(void);
#endif /* CPU_X86_MM_MULTI_SEGMENT_H_ */


@ -39,10 +39,12 @@
#include "stacks.h"
static dom_kern_data_t __attribute__((section(".kern_prot_dom_bss")))
PROT_DOMAINS_PDCS_NM(kern_dcd);
ATTR_KERN_ADDR_SPACE PROT_DOMAINS_PDCS_NM(kern_dcd);
PROT_DOMAINS_ALLOC_IMPL(kern_dcd);
static dom_client_data_t ATTR_BSS_KERN kern_dcd;
static dom_kern_data_t __attribute__((section(".app_prot_dom_bss")))
PROT_DOMAINS_PDCS_NM(app_dcd);
ATTR_KERN_ADDR_SPACE PROT_DOMAINS_PDCS_NM(app_dcd);
PROT_DOMAINS_ALLOC_IMPL(app_dcd);
static dom_client_data_t ATTR_BSS_KERN app_dcd;
/*---------------------------------------------------------------------------*/


@ -40,6 +40,10 @@
#define X86_CONF_PROT_DOMAINS__NONE 0
#define X86_CONF_PROT_DOMAINS__PAGING 1
#define X86_CONF_PROT_DOMAINS__TSS 2
#define X86_CONF_PROT_DOMAINS_MULTI_SEG \
(X86_CONF_PROT_DOMAINS == X86_CONF_PROT_DOMAINS__TSS)
/** Privilege level (ring) for exception handlers and other supervisory code */
#define PRIV_LVL_EXC 0
@ -68,6 +72,49 @@ typedef uint32_t dom_id_t;
#if X86_CONF_PROT_DOMAINS == X86_CONF_PROT_DOMAINS__PAGING
#include "paging-prot-domains.h"
#elif X86_CONF_PROT_DOMAINS == X86_CONF_PROT_DOMAINS__TSS
#include "tss-prot-domains.h"
#endif
#ifndef ATTR_META_ADDR_SPACE
#define ATTR_META_ADDR_SPACE
#endif
#ifndef ATTR_MMIO_ADDR_SPACE
#define ATTR_MMIO_ADDR_SPACE
#endif
#ifndef ATTR_KERN_ADDR_SPACE
#define ATTR_KERN_ADDR_SPACE
#endif
#ifndef MMIO_READL
#define MMIO_READL(dst, src) dst = (src)
#define MMIO_READW(dst, src) dst = (src)
#define MMIO_READB(dst, src) dst = (src)
#define MMIO_WRITEL(dst, src) MMIO_READL(dst, src)
#define MMIO_WRITEW(dst, src) MMIO_READW(dst, src)
#define MMIO_WRITEB(dst, src) MMIO_READB(dst, src)
#endif
#ifndef KERN_READL
#define KERN_READL(dst, src) dst = (src)
#define KERN_READW(dst, src) dst = (src)
#define KERN_READB(dst, src) dst = (src)
#define KERN_WRITEL(dst, src) KERN_READL(dst, src)
#define KERN_WRITEW(dst, src) KERN_READW(dst, src)
#define KERN_WRITEB(dst, src) KERN_READB(dst, src)
#endif
#ifndef META_READL
#define META_READL(dst, src) dst = (src)
#define META_READW(dst, src) dst = (src)
#define META_READB(dst, src) dst = (src)
#define META_WRITEL(dst, src) META_READL(dst, src)
#define META_WRITEW(dst, src) META_READW(dst, src)
#define META_WRITEB(dst, src) META_READB(dst, src)
#endif
#ifndef MEMCPY_FROM_META
#define MEMCPY_FROM_META(dst, src, sz) \
memcpy((void *)(dst), (const void *)(src), (sz))
#define MEMCPY_TO_META(dst, src, sz) MEMCPY_FROM_META(dst, src, sz)
#endif
/* The following symbols are defined in the linker script */
@ -77,9 +124,9 @@ extern uint32_t _stext_addr, _etext_addr;
#if X86_CONF_PROT_DOMAINS != X86_CONF_PROT_DOMAINS__NONE
/** Metadata that should not be DMA-accessible */
#define ATTR_BSS_META __attribute__((section(".meta_bss")))
#define ATTR_BSS_META __attribute__((section(".meta_bss"))) ATTR_META_ADDR_SPACE
/** Kernel-owned data */
#define ATTR_BSS_KERN __attribute__((section(".kern_bss")))
#define ATTR_BSS_KERN __attribute__((section(".kern_bss"))) ATTR_KERN_ADDR_SPACE
/** Code that should only be executable during bootup */
#define ATTR_CODE_BOOT __attribute__((section(".boot_text")))
@ -97,6 +144,10 @@ extern uint32_t _ebss_syscall_addr;
/** Bounds for other data sections */
extern uint32_t _sdata_addr, _edata_addr;
#ifndef SEG_KERN
#define SEG_KERN "d"
#endif
/**
* If set, this protection domain is already in the call stack and is not
* available for nested invocations.
@ -114,8 +165,8 @@ extern uint32_t _sdata_addr, _edata_addr;
*/
typedef struct dom_kern_data dom_kern_data_t;
extern volatile dom_kern_data_t prot_domains_kern_data[];
extern volatile dom_kern_data_t prot_domains_kern_data_end[];
extern volatile dom_kern_data_t ATTR_KERN_ADDR_SPACE prot_domains_kern_data[];
extern volatile dom_kern_data_t ATTR_KERN_ADDR_SPACE prot_domains_kern_data_end[];
#define PROT_DOMAINS_ACTUAL_CNT \
(prot_domains_kern_data_end - prot_domains_kern_data)
@ -125,6 +176,7 @@ extern volatile dom_kern_data_t prot_domains_kern_data_end[];
void prot_domains_syscall_dispatcher(void);
#if X86_CONF_PROT_DOMAINS != X86_CONF_PROT_DOMAINS__TSS
/**
* Data associated with each protection domain that is owned by clients of that
* domain and used to identify the domain.
@ -132,15 +184,21 @@ void prot_domains_syscall_dispatcher(void);
struct dom_client_data {
dom_id_t dom_id;
} __attribute__((packed));
#endif
#ifndef PROT_DOMAINS_ALLOC_IMPL
#define PROT_DOMAINS_ALLOC_IMPL(nm)
#endif
/** Allocate the client-owned protection domain data structure. */
#define PROT_DOMAINS_PDCS_NM(nm) _pdcs_##nm
#define PROT_DOMAINS_ALLOC(typ, nm) \
static dom_kern_data_t __attribute__((section(".prot_dom_bss"))) \
PROT_DOMAINS_PDCS_NM(nm); \
ATTR_KERN_ADDR_SPACE PROT_DOMAINS_PDCS_NM(nm); \
PROT_DOMAINS_ALLOC_IMPL(nm); \
static typ ATTR_BSS_KERN nm
#define PROT_DOMAINS_INIT_ID(nm) \
(nm).dom_id = PROT_DOMAINS_GET_DOM_ID(&PROT_DOMAINS_PDCS_NM(nm))
KERN_WRITEL((nm).dom_id, PROT_DOMAINS_GET_DOM_ID(&PROT_DOMAINS_PDCS_NM(nm)))
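/* Illustrative usage sketch (hypothetical driver "foo" with no MMIO region,
 * no metadata, and no port I/O; the names are not from this patch):
 *
 *   PROT_DOMAINS_ALLOC(dom_client_data_t, foo_dcd);
 *
 *   void
 *   foo_init(void)
 *   {
 *     PROT_DOMAINS_INIT_ID(foo_dcd);
 *     prot_domains_reg(&foo_dcd, 0, 0, 0, 0, false);
 *   }
 */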
/**
* Perform early initialization during boot stage 0 to prepare for boot stage 1
@ -169,8 +227,12 @@ void prot_domains_launch_kernel(void);
*/
#define PROT_DOMAINS_INIT_RET_ADDR_CNT 2
#if X86_CONF_PROT_DOMAINS == X86_CONF_PROT_DOMAINS__TSS
void prot_domains_launch_app(void);
#else
void app_main(void);
#define prot_domains_launch_app app_main
#endif
#else
@ -229,7 +291,7 @@ typedef struct dom_client_data dom_client_data_t;
* \param meta_sz Size of metadata
* \param pio Set to true if protection domain requires port IO access
*/
void prot_domains_reg(dom_client_data_t *dcd,
void prot_domains_reg(dom_client_data_t ATTR_KERN_ADDR_SPACE *dcd,
uintptr_t mmio,
size_t mmio_sz,
uintptr_t meta,
@ -237,11 +299,41 @@ void prot_domains_reg(dom_client_data_t *dcd,
bool pio);
#endif
#if X86_CONF_PROT_DOMAINS == X86_CONF_PROT_DOMAINS__NONE
#define prot_domains_copy_dcd(dst, src) *(dst) = *(src)
#else
/**
 * It is necessary to make a local copy of a dom_client_data structure when a
 * multi-segment protection domain implementation is in use, segment attributes
 * are not supported by the compiler, and a dom_client_data structure needs to
 * be passed by value into some function. Otherwise, the compiler will not know
 * to access the non-default segment in which *src is stored and will attempt
 * to copy it out of the default data segment.
 */
static inline void
prot_domains_copy_dcd(struct dom_client_data *dst,
struct dom_client_data ATTR_KERN_ADDR_SPACE *src)
{
KERN_READL(dst->dom_id, src->dom_id);
#if X86_CONF_PROT_DOMAINS == X86_CONF_PROT_DOMAINS__TSS
KERN_READL(dst->tss_sel, src->tss_sel);
#endif
}
#endif
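/* Illustrative call sketch: a caller needing to pass a dom_client_data by
 * value first makes a local copy, e.g. (consume_dcd is hypothetical):
 *
 *   dom_client_data_t local_dcd;
 *   prot_domains_copy_dcd(&local_dcd, &foo_dcd);
 *   consume_dcd(local_dcd);
 */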
#if !X86_CONF_PROT_DOMAINS_MULTI_SEG
#define prot_domains_enable_mmio()
#define prot_domains_disable_mmio()
#define KERN_DATA_OFF_TO_PHYS_ADDR(x) ((uintptr_t)(x))
#define DATA_OFF_TO_PHYS_ADDR(x) ((uintptr_t)(x))
#endif
#if X86_CONF_PROT_DOMAINS == X86_CONF_PROT_DOMAINS__NONE
#define prot_domains_lookup_meta_phys_base(drv) 0
#else
/** Lookup base physical address of metadata region for specified domain */
uintptr_t prot_domains_lookup_meta_phys_base(dom_client_data_t *drv);
uintptr_t prot_domains_lookup_meta_phys_base(dom_client_data_t ATTR_KERN_ADDR_SPACE *drv);
#endif
#if X86_CONF_PROT_DOMAINS != X86_CONF_PROT_DOMAINS__PAGING
@ -270,6 +362,11 @@ uintptr_t prot_domains_lookup_meta_phys_base(dom_client_data_t *drv);
".endif\n\t"
#endif
#ifdef X86_CONF_PROT_DOMAINS_MULTI_SEG
/* include GDT section definitions used when allocating protection domains: */
#include "gdt.h"
#endif
#endif /* !__ASSEMBLER__ */
#endif /* CPU_X86_MM_PROT_DOMAINS_H_ */


@ -59,8 +59,11 @@
#define SEG_WIDTH_GRAN 1
#define SEG_SHAMT_GRAN 15
#define SEG_TYPE_DATA_RDONLY SEG_FLAG(TYPE, 0x00) /* Read only */
#define SEG_TYPE_DATA_RDWR SEG_FLAG(TYPE, 0x02) /* Read/Write */
#define SEG_TYPE_CODE_EXRD SEG_FLAG(TYPE, 0x0A) /* Execute/Read */
#define SEG_TYPE_CODE_EX SEG_FLAG(TYPE, 0x08) /* Execute only */
#define SEG_TYPE_LDT SEG_FLAG(TYPE, 0x02)
#define SEG_TYPE_TSS32_AVAIL SEG_FLAG(TYPE, 0x09)
#define SEG_DESCTYPE_SYS SEG_FLAG(DESCTYPE, 0)
@ -73,6 +76,12 @@
#define SEG_GRAN_BYTE SEG_FLAG(GRAN, 0)
#define SEG_GRAN_PAGE SEG_FLAG(GRAN, 1)
/**
* Maximum length of a segment that can be regulated with a byte-granularity
* segment limit. The limit field is 20 bits wide, so with byte granularity
* it can cover at most 1 MiB.
*/
#define SEG_MAX_BYTE_GRAN_LEN (1 << 20)
/**
* Segment descriptor. See Intel Combined Manual,
* Vol. 3, Section 3.4.5 for more details.
@ -91,7 +100,13 @@ typedef union segment_desc {
uint64_t raw;
} segment_desc_t;
static inline void
#define SEG_DESC_NOT_PRESENT 0
/* The next two functions are invoked by boot code, so they must always be
* inlined to avoid being placed in a different address space than the initial,
* flat address space.
*/
static inline void __attribute__((always_inline))
segment_desc_set_limit(segment_desc_t *c_this, uint32_t len)
{
uint32_t limit = len - 1;
@ -108,7 +123,7 @@ segment_desc_set_limit(segment_desc_t *c_this, uint32_t len)
* \param flags Flags to be added to the default flags: present, default
* operand size of 32 bits, and high limit bits.
*/
static inline void
static inline void __attribute__((always_inline))
segment_desc_init(segment_desc_t *c_this,
uint32_t base, uint32_t len, uint16_t flags)
{


@ -61,6 +61,17 @@
#else
#define STACKS_SIZE_EXC 256
#endif
#elif X86_CONF_PROT_DOMAINS == X86_CONF_PROT_DOMAINS__TSS
/**
* This should be large enough to execute the exception handler with the
* largest stack requirement: double_fault_handler:
* - 1 word for the return address from calling double_fault_handler
* - 1 word for the saved frame pointer in double_fault_handler
* - 2 words that GCC has been observed to skip on the stack to align it
* to a preferred boundary
* - 1 word for the return address for calling halt
*/
#define STACKS_SIZE_EXC (STACKS_SIZE_INT + (6 * 4))
#else
#define STACKS_SIZE_EXC STACKS_SIZE_INT
#endif


@ -33,6 +33,7 @@
#include "helpers.h"
#include "prot-domains.h"
#include <stdbool.h>
typedef uint32_t dom_id_bitmap_t;
@ -40,8 +41,8 @@ typedef struct syscalls_entrypoint {
uintptr_t entrypoint;
dom_id_bitmap_t doms;
} syscalls_entrypoint_t;
extern syscalls_entrypoint_t syscalls_entrypoints[];
extern syscalls_entrypoint_t syscalls_entrypoints_end[];
extern syscalls_entrypoint_t ATTR_KERN_ADDR_SPACE syscalls_entrypoints[];
extern syscalls_entrypoint_t ATTR_KERN_ADDR_SPACE syscalls_entrypoints_end[];
#define SYSCALLS_ACTUAL_CNT (syscalls_entrypoints_end - syscalls_entrypoints)
@ -49,11 +50,11 @@ extern syscalls_entrypoint_t syscalls_entrypoints_end[];
#define SYSCALLS_ALLOC_ENTRYPOINT(nm) \
syscalls_entrypoint_t __attribute__((section(".syscall_bss"))) \
_syscall_ent_##nm
ATTR_KERN_ADDR_SPACE _syscall_ent_##nm
#define SYSCALLS_INIT(nm) \
_syscall_ent_##nm.entrypoint = (uintptr_t)_syscall_##nm; \
_syscall_ent_##nm.doms = 0
KERN_WRITEL(_syscall_ent_##nm.entrypoint, (uintptr_t)_syscall_##nm); \
KERN_WRITEL(_syscall_ent_##nm.doms, 0)
#define SYSCALLS_DEFINE(nm, ...) \
void _syscall_##nm(__VA_ARGS__); \
@ -65,8 +66,19 @@ extern syscalls_entrypoint_t syscalls_entrypoints_end[];
SYSCALLS_STUB_SINGLETON(nm, dcd); \
void _syscall_##nm(__VA_ARGS__)
#define SYSCALLS_AUTHZ(nm, drv) _syscall_ent_##nm.doms |= BIT((drv).dom_id)
#define SYSCALLS_DEAUTHZ(nm, drv) _syscall_ent_##nm.doms &= ~BIT((drv).dom_id)
#define SYSCALLS_AUTHZ_UPD(nm, drv, set) \
{ \
dom_id_t _sc_tmp_id; \
dom_id_bitmap_t _sc_tmp_bm; \
KERN_READL(_sc_tmp_id, (drv).dom_id); \
KERN_READL(_sc_tmp_bm, _syscall_ent_##nm.doms); \
if(set) { \
_sc_tmp_bm |= BIT(_sc_tmp_id); \
} else { \
_sc_tmp_bm &= ~BIT(_sc_tmp_id); \
} \
KERN_WRITEL(_syscall_ent_##nm.doms, _sc_tmp_bm); \
}
/**
* Check that any untrusted pointer that could have been influenced by a caller
@ -78,7 +90,11 @@ extern syscalls_entrypoint_t syscalls_entrypoints_end[];
*
* This also checks that the pointer is either within the stack region or the
* shared data region, which is important for preventing redirection of data
* accesses to MMIO or metadata regions.
* accesses to MMIO or metadata regions. This check is omitted for multi-
* segment protection domain implementations, since the segment settings
* already enforce this property for pointers dereferenced in DS. Pointers
* that can be influenced by a caller should not be dereferenced in any other
* segment.
*
* The pointer is both validated and copied to a new storage location, which
* must be within the callee's local stack region (excluding the parameter
@ -92,6 +108,14 @@ extern syscalls_entrypoint_t syscalls_entrypoints_end[];
* references the return address, it could potentially redirect execution with
* the privileges of the callee protection domain.
*/
#if X86_CONF_PROT_DOMAINS_MULTI_SEG
#define PROT_DOMAINS_VALIDATE_PTR(validated, untrusted, sz) \
validated = untrusted; \
if(((uintptr_t)(validated)) < \
((2 * sizeof(uintptr_t)) + (uintptr_t)__builtin_frame_address(0))) { \
halt(); \
}
#else
#define PROT_DOMAINS_VALIDATE_PTR(validated, untrusted, sz) \
validated = untrusted; \
if((((uintptr_t)(validated)) < \
@ -99,6 +123,7 @@ extern syscalls_entrypoint_t syscalls_entrypoints_end[];
(((uintptr_t)&_edata_addr) <= (((uintptr_t)(validated)) + (sz)))) { \
halt(); \
}
#endif
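/* In the multi-segment variant above, the lower bound of
 * (2 * sizeof(uintptr_t)) + __builtin_frame_address(0) skips over the saved
 * frame pointer and return address of the current frame, so any incoming
 * pointer that could alias the return address is rejected. The segment
 * limits already rule out redirection to MMIO or metadata regions.
 */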
#else
@ -106,10 +131,12 @@ extern syscalls_entrypoint_t syscalls_entrypoints_end[];
#define SYSCALLS_INIT(nm)
#define SYSCALLS_DEFINE(nm, ...) void nm(__VA_ARGS__)
#define SYSCALLS_DEFINE_SINGLETON(nm, dcd, ...) void nm(__VA_ARGS__)
#define SYSCALLS_AUTHZ(nm, drv)
#define SYSCALLS_DEAUTHZ(nm, drv)
#define SYSCALLS_AUTHZ_UPD(nm, drv, set)
#define PROT_DOMAINS_VALIDATE_PTR(validated, untrusted, sz) validated = untrusted
#endif
#define SYSCALLS_AUTHZ(nm, drv) SYSCALLS_AUTHZ_UPD(nm, drv, true)
#define SYSCALLS_DEAUTHZ(nm, drv) SYSCALLS_AUTHZ_UPD(nm, drv, false)
#endif /* CPU_X86_MM_SYSCALLS_H_ */
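Taken together, the intended flow for defining and authorizing a system call
looks roughly like the following sketch. All names here are hypothetical
(foo_poll, foo_dcd, and foo_setup are illustrative, not part of this patch);
foo_dcd is assumed to have been allocated with PROT_DOMAINS_ALLOC as shown in
prot-domains.h.

SYSCALLS_DEFINE_SINGLETON(foo_poll, foo_dcd, uint8_t *dst)
{
  uint8_t *loc_dst;
  /* Copy and bounds-check the caller-influenced pointer before use: */
  PROT_DOMAINS_VALIDATE_PTR(loc_dst, dst, sizeof(*loc_dst));
  *loc_dst = 1;
}

void
foo_setup(dom_client_data_t ATTR_KERN_ADDR_SPACE *client)
{
  SYSCALLS_INIT(foo_poll);
  /* Permit the client's domain to invoke foo_poll: */
  SYSCALLS_AUTHZ(foo_poll, *client);
}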

cpu/x86/mm/tss-prot-domains-asm.S Normal file

@ -0,0 +1,88 @@
/*
* Copyright (C) 2015, Intel Corporation. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived
* from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
* COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
* STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
* OF THE POSSIBILITY OF SUCH DAMAGE.
*/
.text
/* The register assignments below must be kept in sync with the TSS field
 * initialization in prot_domains_reg. Note: Each of these must be a
 * callee-saved register, so that it is restored to its original value prior
 * to the task returning. This will result in the same values being loaded
 * when the task is next invoked.
 */
#define CUR_DOM_ID_BITMAP esi
/* Must match SEG_KERN (plus the trailing 's') in multi-segment.h */
#define SEG_KERN fs
.global prot_domains_syscall_dispatcher
prot_domains_syscall_dispatcher:
#define PROT_DOMAINS_SYSCALL eax
mov prot_domains_syscall, %PROT_DOMAINS_SYSCALL
cmp $syscalls_entrypoints, %PROT_DOMAINS_SYSCALL
jl halt
cmp $syscalls_entrypoints_end, %PROT_DOMAINS_SYSCALL
jnl halt
#define SYSCALLS_ENTRYPOINTS_ALIGN_MASK ebp
mov $3, %SYSCALLS_ENTRYPOINTS_ALIGN_MASK
and %PROT_DOMAINS_SYSCALL, %SYSCALLS_ENTRYPOINTS_ALIGN_MASK
jnz halt
/* Compare allowed domains bitmask against current domain ID bitmap. If
* the check fails, then the current domain ID bitmap value will be zeroed
* out, which could cause incorrect behavior in the future. However, the
* response to a failed check is to halt the system, so destroying the
* current domain ID bitmap value will have no effect.
*/
and %SEG_KERN:4(%PROT_DOMAINS_SYSCALL), %CUR_DOM_ID_BITMAP
jz halt
mov prot_domains_main_esp, %esp
/* Must be a callee-saved register: */
#define ORIG_RET_ADDR edi
/* Update the caller's stack to return back to here */
pop %ORIG_RET_ADDR
push $sysret_dispatcher
/* Jump to the system call body */
jmp *%SEG_KERN:(%PROT_DOMAINS_SYSCALL)
sysret_dispatcher:
push %ORIG_RET_ADDR
iret
/* The task will resume here for the next system call, so it is necessary
* to jump back to the top.
*/
jmp prot_domains_syscall_dispatcher
.global dev_not_avail_isr
dev_not_avail_isr:
clts
iret

cpu/x86/mm/tss-prot-domains.c Normal file

@ -0,0 +1,161 @@
/*
* Copyright (C) 2015-2016, Intel Corporation. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived
* from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
* COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
* STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
* OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include <stdint.h>
#include <string.h>
#include "gdt.h"
#include "helpers.h"
#include "idt.h"
#include "prot-domains.h"
#include "stacks.h"
#include "syscalls.h"
#include "tss.h"
uint32_t prot_domains_main_esp;
syscalls_entrypoint_t ATTR_KERN_ADDR_SPACE *prot_domains_syscall;
/*---------------------------------------------------------------------------*/
void app_main(void);
void
prot_domains_reg(dom_client_data_t ATTR_KERN_ADDR_SPACE *dcd,
uintptr_t mmio, size_t mmio_sz,
uintptr_t meta, size_t meta_sz,
bool pio)
{
segment_desc_t desc;
uint32_t eflags;
dom_id_t dom_id;
volatile struct dom_kern_data ATTR_KERN_ADDR_SPACE *dkd;
KERN_READL(dom_id, dcd->dom_id);
dkd = prot_domains_kern_data + dom_id;
prot_domains_reg_multi_seg(dkd, mmio, mmio_sz, meta, meta_sz);
/* Only the kernel protection domain requires port I/O access outside of the
* interrupt handlers.
*/
eflags = EFLAGS_IOPL(pio ? PRIV_LVL_USER : PRIV_LVL_INT);
if(dom_id == DOM_ID_app) {
eflags |= EFLAGS_IF;
}
/* Keep this initialization in sync with the register definitions in
* tss-prot-domains-asm.S.
*/
KERN_WRITEL(dkd->tss.ebp, 0);
KERN_WRITEL(dkd->tss.ebx, 0);
KERN_WRITEL(dkd->tss.esi, BIT(dom_id));
KERN_WRITEL(dkd->tss.eip,
(dom_id == DOM_ID_app) ?
(uint32_t)app_main :
(uint32_t)prot_domains_syscall_dispatcher);
KERN_WRITEL(dkd->tss.cs, GDT_SEL_CODE);
KERN_WRITEL(dkd->tss.ds, GDT_SEL_DATA);
KERN_WRITEL(dkd->tss.es, GDT_SEL_DATA);
KERN_WRITEL(dkd->tss.fs, LDT_SEL_KERN);
KERN_WRITEL(dkd->tss.gs,
(meta_sz == 0) ? GDT_SEL_NULL : LDT_SEL_META);
KERN_WRITEL(dkd->tss.ss, GDT_SEL_STK);
/* This stack pointer is only actually used in the application protection
 * domain. Other domains are entered through the system call dispatcher,
 * which switches to the main stack.
 */
KERN_WRITEL(dkd->tss.esp,
/* Two return addresses have been consumed: */
STACKS_INIT_TOP + (2 * sizeof(uintptr_t)));
KERN_WRITEL(dkd->tss.eflags, eflags);
KERN_WRITEL(dkd->tss.ldt, GDT_SEL_LDT(dom_id));
KERN_WRITEL(dkd->tss.esp2, STACKS_SIZE_MAIN + STACKS_SIZE_INT);
KERN_WRITEL(dkd->tss.ss2, GDT_SEL_STK_INT);
KERN_WRITEL(dkd->tss.esp0,
STACKS_SIZE_MAIN + STACKS_SIZE_INT + STACKS_SIZE_EXC);
KERN_WRITEL(dkd->tss.ss0, GDT_SEL_STK_EXC);
KERN_WRITEW(dkd->tss.t, 0);
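/* An I/O map base equal to the TSS size indicates that no I/O permission
 * bitmap is present, so port access is governed solely by EFLAGS.IOPL
 * (initialized above).
 */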
KERN_WRITEW(dkd->tss.iomap_base, sizeof(tss_t));
KERN_WRITEL(dkd->tss.cr3, 0);
segment_desc_init(&desc,
KERN_DATA_OFF_TO_PHYS_ADDR((uint32_t)&(dkd->tss)),
sizeof(dkd->tss),
/* It should be possible for code at any privilege level to invoke the task's
* system call dispatcher.
*/
SEG_FLAG(DPL, PRIV_LVL_USER) | SEG_TYPE_TSS32_AVAIL);
gdt_insert(GDT_IDX_TSS(dom_id), desc);
KERN_WRITEW(dcd->tss_sel, GDT_SEL(GDT_IDX_TSS(dom_id), PRIV_LVL_USER));
}
/*---------------------------------------------------------------------------*/
void dev_not_avail_isr(void);
void
prot_domains_impl_init(void)
{
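/* Install the kernel domain's TSS and LDT selectors into the task register
 * and LDT register, so that subsequent boot code runs in the context of the
 * kernel task.
 */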
__asm__ __volatile__ ("ltr %0" :: "r" ((uint16_t)GDT_SEL_TSS(DOM_ID_kern)));
__asm__ __volatile__ ("lldt %0" :: "r" ((uint16_t)GDT_SEL_LDT(DOM_ID_kern)));
idt_set_intr_gate_desc(7,
(uint32_t)dev_not_avail_isr,
GDT_SEL_CODE_EXC, PRIV_LVL_EXC);
}
/*---------------------------------------------------------------------------*/
int main();
void
prot_domains_launch_kernel(void)
{
multi_segment_launch_kernel();
/* Activate kernel protection domain, entering the kernel at main. */
__asm__ __volatile__ (
"pushl %[_ss_]\n\t"
"pushl %[_top_of_stk_]\n\t"
"pushl %[_eflags_]\n\t"
"pushl %[_cs_]\n\t"
"pushl %[_kern_start_]\n\t"
"iretl\n\t"
:
: [_ss_] "g" (GDT_SEL_STK),
[_eflags_] "g" (EFLAGS_IOPL(PRIV_LVL_USER)),
[_cs_] "g" (GDT_SEL_CODE),
[_kern_start_] "g" (main),
/* one address has already been consumed */
[_top_of_stk_] "g" (STACKS_INIT_TOP + sizeof(uint32_t))
);
}
/*---------------------------------------------------------------------------*/
void
prot_domains_launch_app()
{
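/* A far jump through a TSS selector performs a hardware task switch into
 * the application domain's task; the 32-bit offset (0 here) is ignored.
 */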
far_pointer_t app_ptr = { 0, GDT_SEL_TSS(DOM_ID_app) };
__asm__ __volatile__ ("ljmp *%0" :: "m" (app_ptr));
}
/*---------------------------------------------------------------------------*/

cpu/x86/mm/tss-prot-domains.h Normal file

@ -0,0 +1,130 @@
/*
* Copyright (C) 2015-2016, Intel Corporation. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived
* from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
* COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
* STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
* OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef CPU_X86_MM_TSS_PROT_DOMAINS_H_
#define CPU_X86_MM_TSS_PROT_DOMAINS_H_
#include <stdbool.h>
#include <stdint.h>
#include <stdlib.h>
#include "ldt-layout.h"
#include "segmentation.h"
#include "tss.h"
struct dom_kern_data {
/** Task State Segment */
tss_t tss;
/** Local Descriptor Table with per-domain descriptors */
segment_desc_t ldt[LDT_NUM_DESC];
} __attribute__((packed));
/* relies on dom_kern_data: */
#include "multi-segment.h"
/* relies on ATTR_KERN_ADDR_SPACE: */
#include "syscalls.h"
/**
* Data associated with each protection domain that is owned by clients of that
* domain and used to identify the domain.
*/
struct dom_client_data {
dom_id_t dom_id;
/** The selector is only 16 bits, but it is padded to 32 bits. */
uint32_t tss_sel;
};
extern uint32_t prot_domains_main_esp;
#define SYSCALLS_STUB_MIDDLE(nm) \
/* If already in the callee protection domain, skip the protection */ \
/* domain switch and directly invoke the system call body */ \
" je _syscall_" #nm "\n\t" \
" movl $" EXP_STRINGIFY(_syscall_ent_##nm) ", prot_domains_syscall\n\t" \
" mov %esp, prot_domains_main_esp\n\t"
#define SYSCALLS_STUB(nm) \
SYSCALLS_ALLOC_ENTRYPOINT(nm); \
asm ( \
".text\n\t" \
".global " #nm "\n\t" \
#nm ":\n\t" \
" str %ax\n\t" \
/* Compare current Task Register selector to selector for callee */ \
/* protection domain, in tss_sel field of dom_client_data */ \
" cmpw %ax, 8(%esp)\n\t" \
SYSCALLS_STUB_MIDDLE(nm) \
/* This will treat the dom_id field as the offset for the call, but */ \
/* that is ignored when performing a far call to a task */ \
" lcall *4(%esp)\n\t" \
" ret\n\t")
#define SYSCALLS_STUB_SINGLETON(nm, dcd) \
SYSCALLS_ALLOC_ENTRYPOINT(nm); \
asm ( \
".text\n\t" \
".global " #nm "\n\t" \
#nm ":\n\t" \
" str %ax\n\t" \
/* Compare current Task Register selector to selector for callee */ \
/* protection domain, in tss_sel field of dom_client_data */ \
" cmpw %ax, %" SEG_KERN "s:(4 + " #dcd ")\n\t" \
SYSCALLS_STUB_MIDDLE(nm) \
/* This will treat the dom_id field as the offset for the call, but */ \
/* that is ignored when performing a far call to a task */ \
" lcall *%" SEG_KERN "s:" #dcd "\n\t" \
" ret\n\t")
#define PROT_DOMAINS_ENTER_ISR(exc) \
MULTI_SEGMENT_ENTER_ISR(exc) \
/* It is possible that the system call dispatcher is being interrupted, */ \
/* and some interrupt handlers perform system calls. Thus, it is */ \
/* necessary to save and restore the system call dispatcher parameters */ \
/* (in callee-saved registers). */ \
"mov prot_domains_main_esp, %%esi\n\t" \
"mov prot_domains_syscall, %%edi\n\t" \
PROT_DOMAINS_ENTER_ISR_COMMON(exc)
#define PROT_DOMAINS_LEAVE_ISR(exc) \
PROT_DOMAINS_LEAVE_ISR_COMMON(exc) \
"mov %%edi, prot_domains_syscall\n\t" \
"mov %%esi, prot_domains_main_esp\n\t" \
MULTI_SEGMENT_LEAVE_ISR(exc)
/* Allocate two additional GDT entries for each protection domain. Note that
* the particular storage allocated by this statement may actually be used for
* some other protection domain, depending on how the linker happens to arrange
* all of the GDT storage. The GDT_IDX_TSS and GDT_IDX_LDT macros in
* gdt-layout.h determine which storage is used for each protection domain.
* Thus, this storage should not be referenced directly by its variable name.
*/
#define PROT_DOMAINS_ALLOC_IMPL(nm) \
static segment_desc_t ATTR_BSS_GDT_MID _gdt_storage_##nm[2]
#endif /* CPU_X86_MM_TSS_PROT_DOMAINS_H_ */


@ -87,4 +87,6 @@ SECTIONS {
*/
_ebss_gdt_addr = .;
}
_ebss_pre_dma_addr = ALIGN(32);
}


@ -30,26 +30,18 @@
SECTIONS {
/*
It would be more natural to use a 1K alignment for this entire section.
However, the UEFI GenFw program ratchets up its alignment
granularity to the maximum granularity discovered in its input file.
Using 1K-alignment perturbs the symbols, hindering debugging. Thus,
this section is simply padded out to the desired alignment and
declared to have a section alignment of only 32 bytes.
The alignment directives used here suffice even when paging is in use,
because this is the last section and directly follows one (.bss.meta)
that is 4K-aligned.
*/
.bss.dma (NOLOAD) : ALIGN (32)
.bss.dma (NOLOAD) : AT(_ebss_pre_dma_addr) ALIGN (32)
{
/* The IMR feature operates at 1K granularity. */
. = ALIGN(1K);
_sbss_dma_addr = .;
/* IMRs are used to restrict DMA, and they require 1K physical address alignment. */
. += ALIGN(_ebss_pre_dma_addr, 1K) - ALIGN(_ebss_pre_dma_addr, 32);
*(.dma_bss)
. = ALIGN(1K);
_ebss_dma_addr = .;
}
_sbss_dma_addr = LOADADDR(.bss.dma) + ALIGN(_ebss_pre_dma_addr, 1K) - ALIGN(_ebss_pre_dma_addr, 32);
/*
Effectively pointing beyond the end of .bss.dma is acceptable, since
.bss.dma is the last section in memory.
*/
_ebss_dma_addr = ALIGN(LOADADDR(.bss.dma) + SIZEOF(.bss.dma), 1K);
}
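To make the alignment arithmetic concrete: if, for illustration,
_ebss_pre_dma_addr were 0x5020 (already 32-byte aligned), then
ALIGN(0x5020, 1K) is 0x5400 and ALIGN(0x5020, 32) is 0x5020, so the ". +="
directive inserts 0x3E0 bytes of padding and _sbss_dma_addr resolves to the
1K-aligned load address 0x5400, satisfying the IMR alignment requirement.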

cpu/x86/quarkX1000_multi_seg.ld Normal file

@ -0,0 +1,190 @@
/*
* Copyright (C) 2015, Intel Corporation. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived
* from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
* COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
* STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
* OF THE POSSIBILITY OF SUCH DAMAGE.
*/
OUTPUT_FORMAT("elf32-i386")
ENTRY(start)
/*
The TSS-based protection domain implementation does not explicitly reference
these symbols, so we list them here to prevent them from being garbage-
collected.
*/
EXTERN(stacks_int)
EXTERN(stacks_exc)
PHDRS {
boot_text PT_LOAD;
text PT_LOAD;
data PT_LOAD;
}
SECTIONS {
/*
The OS-Dev Wiki says it is common for kernels to start at 1M. Addresses below
that are used by the BIOS/EFI firmware, the bootloader, and memory-mapped I/O.
The UEFI GenFw program inserts a 0x220 byte offset between the image base and
the .text section. We add that same offset here to align the symbols in the
UEFI DLL with those in the final UEFI binary to make debugging easier.
*/
. = 1M + 0x220;
/*
The GenFw program in the EDK2 UEFI toolchain outputs UEFI images with a
section alignment of at least 32 bytes. Thus, it is desirable to use at
least that alignment granularity to avoid symbols being shifted from the
intermediate DLL to the final UEFI image. Such shifting may make
debugging more difficult by preventing the DLL from being a useful
source of symbol information. The debugging symbols are not included in
the final UEFI image.
*/
.text.boot : ALIGN (32)
{
*(.multiboot)
/*
The initial bootstrap code expects to operate in a flat address
space with an identity mapping between linear and physical
addresses.
*/
*(.boot_text)
} :boot_text
/* The post-boot code segments define tight bounds around the code
section, so this directive resets the virtual address to 0. */
. = 0;
/* The virtual address differs from the load address. */
.text : AT(LOADADDR(.text.boot) + ALIGN(SIZEOF(.text.boot), 32)) ALIGN (32)
{
/*
These BYTE directives emit a UD2 instruction to cause execution to
halt if the control flow ever deviates to address 0. This also
prevents other code from being placed at address 0. Some code
considers a function pointer to address 0 to be a null function
pointer.
*/
BYTE(0x0F);
BYTE(0x0B);
*(.text*)
/*
An alternative design to eliminate the need for ALIGN directives
within the AT directives in later sections could have padded
each section out to a 32-byte boundary. However, that would have
enabled unneeded software accesses to the padding past the end of actual
code/data in each section, since segments are also configured based on
the values of the SIZEOF expressions. As a general principle, accesses
should be as restricted as is feasible.
*/
} :text
_stext_addr = LOADADDR(.text);
_etext_addr = LOADADDR(.text) + SIZEOF(.text);
. = 0;
.data : AT(ALIGN(_etext_addr, 32)) ALIGN (32)
{
*(.main_stack)
*(.int_stack)
*(.exc_stack)
*(.rodata*)
*(.data*)
/*
These could alternatively be treated as read-only data to prevent tampering
from the user privilege level.
*/
_sdata_shared_isr = .;
KEEP(*(.shared_isr_data*))
_edata_shared_isr = .;
} :data
.bss : ALIGN (32)
{
*(COMMON)
*(.bss*)
}
_sdata_addr = LOADADDR(.data);
_edata_addr = LOADADDR(.bss) + SIZEOF(.bss);
. = 0;
.bss.kern (NOLOAD) : AT(ALIGN(_edata_addr, 32)) ALIGN (32)
{
/*
This directive prevents any data from being allocated at address
zero, since the address 0 is commonly used to represent null
pointers.
*/
LONG(0);
*(.kern_bss)
syscalls_entrypoints = .;
*(.syscall_bss)
syscalls_entrypoints_end = .;
}
_ebss_syscall_addr = LOADADDR(.bss.kern) + SIZEOF(.bss.kern);
.bss.kern_priv (NOLOAD) : ALIGN (32)
{
prot_domains_kern_data = .;
/*
The kernel and app protection domain control structures must always
be placed in the first two slots in this order, so that they have
well-known protection domain IDs:
*/
*(.kern_prot_dom_bss)
*(.app_prot_dom_bss)
*(.prot_dom_bss)
prot_domains_kern_data_end = .;
*(.gdt_bss_start)
KEEP(*(.gdt_bss_mid))
*(.gdt_bss)
_ebss_gdt_addr = .;
}
_sbss_kern_addr = LOADADDR(.bss.kern);
_ebss_kern_addr = LOADADDR(.bss.kern_priv) + SIZEOF(.bss.kern_priv);
. = _ebss_kern_addr;
.bss.meta (NOLOAD) : AT(ALIGN(_ebss_kern_addr, 32)) ALIGN (32)
{
*(.meta_bss)
}
/* .bss.meta may be empty, so this uses .bss.kern_priv as a base instead: */
_ebss_pre_dma_addr = ALIGN(ALIGN(_ebss_kern_addr, 32) + SIZEOF(.bss.meta), 32);
}
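Note how this script cooperates with prot_domains_gdt_init in
multi-segment.c: .text is linked starting at virtual address 0 but loaded at
_stext_addr, and the code descriptor is built with base _stext_addr and a
limit equal to the section size, so virtual address v in the code segment
maps to physical address _stext_addr + v. The data, kernel-data, and stack
segments are derived analogously from _sdata_addr, _sbss_kern_addr, and the
stack section symbols.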


@ -129,7 +129,7 @@ SECTIONS {
*(.data*)
/*
These could also be treated as read-only data to prevent tampering
These could alternatively be treated as read-only data to prevent tampering
from the user privilege level.
*/
_sdata_shared_isr = .;
@ -201,4 +201,6 @@ SECTIONS {
. = ALIGN(4K);
}
_ebss_pre_dma_addr = ALIGN(32);
}


@ -9,7 +9,17 @@ MULTIBOOT = $(CONTIKI_PROJECT).$(MULTIBOOT_SFX)
# UEFI binary
UEFI_DLL_SFX = $(TARGET).dll
UEFI_DLL = $(CONTIKI_PROJECT).$(UEFI_DLL_SFX)
UEFI_LDFLAGS += -Xlinker --emit-relocs -Xlinker --entry=uefi_start
# The GenFw program is unable to process absolute symbols like _stext_addr,
# etc., that are defined in quarkX1000_dma.ld and quarkX1000_multi_seg.ld
# and used to configure segments in multi-segment.c, etc. Furthermore,
# relocating the UEFI image during load would result in those symbols not
# pointing to the expected image locations. So, relocation data is omitted
# from the intermediate UEFI DLL. This will only result in a
# correctly-functioning build if the UEFI firmware does not attempt to
# relocate the UEFI image, so it may be desirable in the future to revisit
# this design. To emit relocation data, '-Xlinker --emit-relocs' should be
# appended to the following line.
UEFI_LDFLAGS = -Xlinker --entry=uefi_start
UEFI_SFX = $(TARGET).efi
UEFI = $(CONTIKI_PROJECT).$(UEFI_SFX)