Skip to content
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,7 @@ Following environment variables control the behavior of DTO library:
DTO_DSA_MEMSET=0/1, 1 (default) - DTO uses DSA to process memset, 0 - DTO uses system memset
DTO_DSA_MEMCMP=0/1, 1 (default) - DTO uses DSA to process memcmp, 0 - DTO uses system memcmp
DTO_DSA_CC=0/1, 1 (default) - DTO sets DSA Cache Control flag to 1 if DSA supports cache control, 0 - DTO sets DSA Cache Control flag to 0
DTO_OVERLAPPING_MEMMOVE_ACTION=0/1, 0 (default) - DTO submits memmove operations with overlapping buffers entirely to CPU, 1 - DTO submits them entirely to DSA
DTO_UMWAIT_DELAY=xxxx defines delay for umwait command (check max possible value at: /sys/devices/system/cpu/umwait_control/max_time), default is 100000
DTO_LOG_FILE=<dto log file path> Redirect the DTO output to the specified file instead of standard output (useful for debugging and statistics collection). The file name is suffixed with the process pid.
DTO_LOG_LEVEL=0/1/2 controls the log level. A higher value means more verbose logging (default 0).
Expand Down
151 changes: 98 additions & 53 deletions dto.c
Original file line number Diff line number Diff line change
Expand Up @@ -89,6 +89,12 @@ enum numa_aware {
NA_LAST_ENTRY
};

/*
 * Policy for memmove calls whose source and destination buffers overlap.
 * The numeric values are the settings accepted by the
 * DTO_OVERLAPPING_MEMMOVE_ACTION environment variable, so they are
 * spelled out explicitly.
 */
enum overlapping_memmove_actions {
	/* 0 (default): run the whole overlapping memmove on the CPU */
	OVERLAPPING_CPU = 0,
	/* 1: submit the whole overlapping memmove to DSA */
	OVERLAPPING_DSA = 1,
	/* one past the last action; presumably a bound for validation */
	OVERLAPPING_LAST_ENTRY = 2
};

static const char * const numa_aware_names[] = {
[NA_NONE] = "none",
[NA_BUFFER_CENTRIC] = "buffer-centric",
Expand Down Expand Up @@ -117,6 +123,8 @@ static uint8_t dto_dsa_cc = 1;

static unsigned long dto_umwait_delay = UMWAIT_DELAY_DEFAULT;

static uint8_t dto_overlapping_memmove_action = OVERLAPPING_CPU;

static uint8_t fork_handler_registered;

enum memop {
Expand Down Expand Up @@ -188,17 +196,17 @@ static struct timespec dto_start_time;
} while (0) \


#define DTO_COLLECT_STATS_DSA_END(cs, st, et, op, n, tbc, r) \
#define DTO_COLLECT_STATS_DSA_END(cs, st, et, op, n, overlap, tbc, r) \
do { \
if (unlikely(cs)) { \
uint64_t t; \
clock_gettime(CLOCK_BOOTTIME, &et); \
t = (((et.tv_sec*1000000000) + et.tv_nsec) - \
((st.tv_sec*1000000000) + st.tv_nsec)); \
if (unlikely(r != SUCCESS)) \
update_stats(op, n, tbc, t, DSA_CALL_FAILED, r); \
update_stats(op, n, overlap, tbc, t, DSA_CALL_FAILED, r); \
else \
update_stats(op, n, tbc, t, DSA_CALL_SUCCESS, 0); \
update_stats(op, n, overlap, tbc, t, DSA_CALL_SUCCESS, 0); \
} \
} while (0) \

Expand All @@ -209,7 +217,7 @@ static struct timespec dto_start_time;
clock_gettime(CLOCK_BOOTTIME, &et); \
t = (((et.tv_sec*1000000000) + et.tv_nsec) - \
((st.tv_sec*1000000000) + st.tv_nsec)); \
update_stats(op, orig_n, n, t, STDC_CALL, 0); \
update_stats(op, orig_n, false, n, t, STDC_CALL, 0); \
} \
} while (0) \

Expand Down Expand Up @@ -513,10 +521,7 @@ static __always_inline int dsa_execute(struct dto_wq *wq,
ret = 0;
}
if (!ret) {
if (auto_adjust_knobs)
dsa_wait_and_adjust(comp);
else
dsa_wait_no_adjust(comp);
dsa_wait_no_adjust(comp);

if (*comp == DSA_COMP_SUCCESS) {
thr_bytes_completed += hw->xfer_size;
Expand All @@ -532,9 +537,14 @@ static __always_inline int dsa_execute(struct dto_wq *wq,
}

#ifdef DTO_STATS_SUPPORT
static void update_stats(int op, size_t n, size_t bytes_completed,
static void update_stats(int op, size_t n, bool overlapping, size_t bytes_completed,
uint64_t elapsed_ns, int group, int error_code)
{
// dto_memcpymove didn't actually submit the request to DSA, so there is nothing to log. This will be captured by a second call
if (op == MEMMOVE && overlapping && dto_overlapping_memmove_action == OVERLAPPING_CPU && group == DSA_CALL_SUCCESS) {
return;
}

int bucket = (n / HIST_BUCKET_SIZE);

if (bucket >= HIST_NO_BUCKETS) /* last bucket includes remaining sizes */
Expand Down Expand Up @@ -1240,6 +1250,14 @@ static int init_dto(void)
dto_dsa_memcmp = !!dto_dsa_memcmp;
}

env_str = getenv("DTO_OVERLAPPING_MEMMOVE_ACTION");
if (env_str != NULL) {
errno = 0;
dto_overlapping_memmove_action = strtoul(env_str, NULL, 10);
if (errno)
dto_overlapping_memmove_action = OVERLAPPING_CPU;
}

#ifdef DTO_STATS_SUPPORT
env_str = getenv("DTO_COLLECT_STATS");
if (env_str != NULL) {
Expand Down Expand Up @@ -1484,55 +1502,75 @@ static bool is_overlapping_buffers (void *dest, const void *src, size_t n)
return true;
}

static void dto_memcpymove(void *dest, const void *src, size_t n, bool is_memcpy, int *result)
static bool dto_memcpymove(void *dest, const void *src, size_t n, bool is_memcpy, int *result)
{
struct dto_wq *wq = get_wq(dest);
struct dto_wq *wq;
size_t cpu_size, dsa_size;
bool is_overlapping;

thr_desc.opcode = DSA_OPCODE_MEMMOVE;
thr_desc.flags = IDXD_OP_FLAG_CRAV | IDXD_OP_FLAG_RCR;
if (dto_dsa_cc && (wq->dsa_gencap & GENCAP_CC_MEMORY))
thr_desc.flags |= IDXD_OP_FLAG_CC;
thr_desc.completion_addr = (uint64_t)&thr_comp;
thr_bytes_completed = 0;

/* cpu_size_fraction guaranteed to be >= 0 and < 1 */
if (!is_memcpy && is_overlapping_buffers(dest, src, n))
if (!is_memcpy && is_overlapping_buffers(dest, src, n)) {
cpu_size = 0;
else
is_overlapping = true;
} else {
/* cpu_size_fraction guaranteed to be >= 0 and < 1 */
cpu_size = n * cpu_size_fraction / 100;
is_overlapping = false;
}

// If this is an overlapping memmove and the action is to perform on CPU, return having done nothing and
// memmove will perform the copy and correctly attribute statistics to stdlib call group
if (is_overlapping && dto_overlapping_memmove_action == OVERLAPPING_CPU) {
*result = SUCCESS;
return true;
}

dsa_size = n - cpu_size;
wq = get_wq(dest);

thr_bytes_completed = 0;
thr_desc.opcode = DSA_OPCODE_MEMMOVE;
thr_desc.flags = IDXD_OP_FLAG_CRAV | IDXD_OP_FLAG_RCR;
if (dto_dsa_cc && (wq->dsa_gencap & GENCAP_CC_MEMORY))
thr_desc.flags |= IDXD_OP_FLAG_CC;
thr_desc.completion_addr = (uint64_t)&thr_comp;

if (dsa_size <= wq->max_transfer_size) {
thr_desc.src_addr = (uint64_t) src + cpu_size;
thr_desc.dst_addr = (uint64_t) dest + cpu_size;
thr_desc.xfer_size = (uint32_t) dsa_size;
thr_comp.status = 0;
*result = dsa_submit(wq, &thr_desc);
if (*result == SUCCESS) {
if (cpu_size) {
if (is_memcpy)
orig_memcpy(dest, src, cpu_size);
else
orig_memmove(dest, src, cpu_size);
thr_bytes_completed += cpu_size;
if (is_overlapping) {
*result = dsa_execute(wq, &thr_desc, &thr_comp.status);
} else {
*result = dsa_submit(wq, &thr_desc);
if (*result == SUCCESS) {
if (cpu_size) {
if (is_memcpy)
orig_memcpy(dest, src, cpu_size);
else
orig_memmove(dest, src, cpu_size);
thr_bytes_completed += cpu_size;
}
*result = dsa_wait(wq, &thr_desc, &thr_comp.status);
}
*result = dsa_wait(wq, &thr_desc, &thr_comp.status);
}
} else {
uint32_t threshold;
size_t current_cpu_size_fraction = cpu_size_fraction; // the cpu_size_fraction might be changed by the auto tune algorithm
threshold = wq->max_transfer_size * 100 / (100 - current_cpu_size_fraction);
if (is_overlapping) {
threshold = wq->max_transfer_size;
cpu_size = 0;
} else {
threshold = wq->max_transfer_size * 100 / (100 - current_cpu_size_fraction);
}

do {
size_t len;

len = n <= threshold ? n : threshold;

if (!is_memcpy && is_overlapping_buffers(dest, src, len))
cpu_size = 0;
else
if (!is_overlapping)
cpu_size = len * current_cpu_size_fraction / 100;

dsa_size = len - cpu_size;
Expand All @@ -1541,30 +1579,36 @@ static void dto_memcpymove(void *dest, const void *src, size_t n, bool is_memcpy
thr_desc.dst_addr = (uint64_t) dest + cpu_size + thr_bytes_completed;
thr_desc.xfer_size = (uint32_t) dsa_size;
thr_comp.status = 0;
*result = dsa_submit(wq, &thr_desc);
if (*result == SUCCESS) {
if (cpu_size) {
const void *src1 = src + thr_bytes_completed;
void *dest1 = dest + thr_bytes_completed;

if (is_memcpy)
orig_memcpy(dest1, src1, cpu_size);
else
orig_memmove(dest1, src1, cpu_size);
thr_bytes_completed += cpu_size;
if (is_overlapping){
*result = dsa_execute(wq, &thr_desc, &thr_comp.status);
} else {
*result = dsa_submit(wq, &thr_desc);
if (*result == SUCCESS) {
if (cpu_size) {
const void *src1 = src + thr_bytes_completed;
void *dest1 = dest + thr_bytes_completed;

if (is_memcpy)
orig_memcpy(dest1, src1, cpu_size);
else
orig_memmove(dest1, src1, cpu_size);
thr_bytes_completed += cpu_size;
}
*result = dsa_wait(wq, &thr_desc, &thr_comp.status);
}
*result = dsa_wait(wq, &thr_desc, &thr_comp.status);
}

if (*result != SUCCESS)
break;
n -= len;
/* If remaining bytes are less than dsa_min_size,
* dont submit to DSA. Instead, complete remaining
* bytes on CPU
*/
* dont submit to DSA. Instead, complete remaining
* bytes on CPU
*/
} while (n >= dsa_min_size);
}

return is_overlapping;
}

static int dto_memcmp(const void *s1, const void *s2, size_t n, int *result)
Expand Down Expand Up @@ -1694,7 +1738,7 @@ void *memset(void *s1, int c, size_t n)
dto_memset(s1, c, n, &result);

#ifdef DTO_STATS_SUPPORT
DTO_COLLECT_STATS_DSA_END(collect_stats, st, et, MEMSET, n, thr_bytes_completed, result);
DTO_COLLECT_STATS_DSA_END(collect_stats, st, et, MEMSET, n, false, thr_bytes_completed, result);
#endif
if (thr_bytes_completed != n) {
/* fallback to std call if job is only partially completed */
Expand Down Expand Up @@ -1744,7 +1788,7 @@ void *memcpy(void *dest, const void *src, size_t n)
dto_memcpymove(dest, src, n, 1, &result);

#ifdef DTO_STATS_SUPPORT
DTO_COLLECT_STATS_DSA_END(collect_stats, st, et, MEMCOPY, n, thr_bytes_completed, result);
DTO_COLLECT_STATS_DSA_END(collect_stats, st, et, MEMCOPY, n, false, thr_bytes_completed, result);
#endif
if (thr_bytes_completed != n) {
/* fallback to std call if job is only partially completed */
Expand Down Expand Up @@ -1776,6 +1820,7 @@ void *memmove(void *dest, const void *src, size_t n)
int result = 0;
void *ret = dest;
int use_orig_func = USE_ORIG_FUNC(n, dto_dsa_memmove);
bool is_overlapping;
#ifdef DTO_STATS_SUPPORT
struct timespec st, et;
size_t orig_n = n;
Expand All @@ -1794,10 +1839,10 @@ void *memmove(void *dest, const void *src, size_t n)
#ifdef DTO_STATS_SUPPORT
DTO_COLLECT_STATS_START(collect_stats, st);
#endif
dto_memcpymove(dest, src, n, 0, &result);
is_overlapping = dto_memcpymove(dest, src, n, 0, &result);

#ifdef DTO_STATS_SUPPORT
DTO_COLLECT_STATS_DSA_END(collect_stats, st, et, MEMMOVE, n, thr_bytes_completed, result);
DTO_COLLECT_STATS_DSA_END(collect_stats, st, et, MEMMOVE, n, is_overlapping, thr_bytes_completed, result);
#endif
if (thr_bytes_completed != n) {
/* fallback to std call if job is only partially completed */
Expand Down Expand Up @@ -1850,7 +1895,7 @@ int memcmp(const void *s1, const void *s2, size_t n)
ret = dto_memcmp(s1, s2, n, &result);

#ifdef DTO_STATS_SUPPORT
DTO_COLLECT_STATS_DSA_END(collect_stats, st, et, MEMCMP, n, thr_bytes_completed, result);
DTO_COLLECT_STATS_DSA_END(collect_stats, st, et, MEMCMP, n, false, thr_bytes_completed, result);
#endif
if (thr_bytes_completed != n) {
/* fallback to std call if job is only partially completed */
Expand Down