diff --git a/HOWTO b/HOWTO
index 8c9e41356b..4d0abd401a 100644
--- a/HOWTO
+++ b/HOWTO
@@ -1944,6 +1944,7 @@ I/O engine
 		**posixaio**
 			POSIX asynchronous I/O using :manpage:`aio_read(3)` and
 			:manpage:`aio_write(3)`.
+			This engine defines engine specific options.

 		**solarisaio**
 			Solaris native asynchronous I/O.
@@ -2642,6 +2643,22 @@ with the caveat that when used on the command line, they must come after the
 	If set, stdout and stderr streams are redirected to files named from the
 	job name. Default is true.

+.. option:: posixaio_respect_iodepth_batch_complete_max=bool : [posixaio]
+
+	If set, limit batch completions according to
+	:option:`iodepth_batch_complete_max`, as other engines do. Default is
+	false, effectively behaving as though
+	:option:`iodepth_batch_complete_max` has the same value as
+	:option:`iodepth`.
+	Only applies to posixaio_wait=aio_suspend, as the other wait
+	mechanisms already respect :option:`iodepth_batch_complete_max`.
+
+.. option:: posixaio_wait=str : [posixaio]
+
+	Selects the mechanism used for waiting for I/Os to complete.
+	Default is aio_suspend. On FreeBSD, aio_waitcomplete may be used.
+
+

 I/O depth
 ~~~~~~~~~
diff --git a/configure b/configure
index 84ccce040e..7fc0b8b596 100755
--- a/configure
+++ b/configure
@@ -737,6 +737,26 @@ EOF
 fi
 print_config "POSIX AIO fsync" "$posix_aio_fsync"

+##########################################
+# aio_waitcomplete probe
+if test "$have_aio_waitcomplete" != "yes" ; then
+  have_aio_waitcomplete="no"
+fi
+cat > $TMPC << EOF
+#include <aio.h>
+#include <stdlib.h>
+int main(void)
+{
+  struct aiocb *cb;
+  aio_waitcomplete(&cb, NULL);
+  return 0;
+}
+EOF
+if compile_prog "" "" "aio_waitcomplete" ; then
+  have_aio_waitcomplete="yes"
+fi
+print_config "aio_waitcomplete()" "$have_aio_waitcomplete"
+
 ##########################################
 # POSIX pshared attribute probe
 if test "$posix_pshared" != "yes" ; then
@@ -2858,6 +2878,9 @@ fi
 if test "$posix_aio_fsync" = "yes" ; then
   output_sym "CONFIG_POSIXAIO_FSYNC"
 fi
+if test "$have_aio_waitcomplete" = "yes" ; then
+  output_sym "CONFIG_HAVE_AIO_WAITCOMPLETE"
+fi
 if test "$posix_pshared" = "yes" ; then
   output_sym "CONFIG_PSHARED"
 fi
diff --git a/engines/posixaio.c b/engines/posixaio.c
index 135d088c7a..1e06bac1b1 100644
--- a/engines/posixaio.c
+++ b/engines/posixaio.c
@@ -11,10 +11,63 @@
 #include <fcntl.h>

 #include "../fio.h"
+#include "../optgroup.h"
+
+/* Wait mechanisms selectable through the posixaio_wait option. */
+enum {
+	FIO_POSIXAIO_SUSPEND,
+	FIO_POSIXAIO_WAITCOMPLETE,
+};

 struct posixaio_data {
 	struct io_u **aio_events;
 	unsigned int queued;
+	int (*getevents)(struct thread_data *, unsigned int, unsigned int,
+			 const struct timespec *);
+};
+
+struct posixaio_options {
+	void *pad;
+	unsigned int respect_iodepth_batch_complete_max;
+	unsigned int wait;
+};
+
+static struct fio_option options[] = {
+	{
+		.name	= "posixaio_respect_iodepth_batch_complete_max",
+		.lname	= "Respect iodepth_batch_complete_max for wait=aio_suspend",
+		.type	= FIO_OPT_BOOL,
+		.off1	= offsetof(struct posixaio_options, respect_iodepth_batch_complete_max),
+		.help	= "Whether to cap batch completion for wait=aio_suspend",
+		.def	= "0",
+		.category = FIO_OPT_C_ENGINE,
+		.group	= FIO_OPT_G_POSIXAIO,
+	},
+	{
+		.name	= "posixaio_wait",
+		.lname	= "POSIX AIO wait mechanism",
+		.type	= FIO_OPT_STR,
+		.off1	= offsetof(struct posixaio_options, wait),
+		.help	= "Select mechanism for waiting for I/O completion",
+		.def	= "aio_suspend",
+		.posval = {
+			  { .ival = "aio_suspend",
+			    .oval = FIO_POSIXAIO_SUSPEND,
+			    .help = "Use aio_suspend()",
+			  },
+#ifdef CONFIG_HAVE_AIO_WAITCOMPLETE
+			  { .ival = "aio_waitcomplete",
+			    .oval = FIO_POSIXAIO_WAITCOMPLETE,
+			    .help = "Use aio_waitcomplete()",
+			  },
+#endif
+		},
+		.category = FIO_OPT_C_ENGINE,
+		.group	= FIO_OPT_G_POSIXAIO,
+	},
+	{
+		.name	= NULL,
+	},
 };

 static unsigned long long ts_utime_since_now(const struct timespec *start)
@@ -55,12 +108,78 @@ static int fio_posixaio_prep(struct thread_data fio_unused *td,
 	return 0;
 }

+#ifdef CONFIG_HAVE_AIO_WAITCOMPLETE
+
+/*
+ * Reap completions with FreeBSD's aio_waitcomplete(). Waits (using the
+ * caller's timeout) until at least 'min' events have been collected, then
+ * polls with a zero timeout for up to 'max' events in total.
+ */
+static int fio_posixaio_getevents_waitcomplete(struct thread_data *td,
+					       unsigned int min,
+					       unsigned int max,
+					       const struct timespec *t)
+{
+	struct posixaio_data *pd = td->io_ops_data;
+	struct timespec zero_timeout = {0};
+	struct timespec *timeout;
+	struct aiocb *aiocb;
+	struct io_u *io_u;
+	unsigned int events = 0;
+	ssize_t retval;
+	int err;
+
+	do {
+		if (events < min) {
+			/* Wait until the minimum is satisfied. */
+			timeout = (struct timespec *)t;
+		} else {
+			/* Consume as many more as we can without waiting. */
+			timeout = &zero_timeout;
+		}
+
+		aiocb = NULL;
+		retval = aio_waitcomplete(&aiocb, timeout);
+		err = retval < 0 ? errno : 0;
+
+		/*
+		 * A negative return with a control block means that request
+		 * itself failed; without one, the wait failed or timed out.
+		 */
+		if (!aiocb) {
+			if (err == EINTR)
+				continue;
+			if (err && err != EAGAIN && err != EINPROGRESS)
+				td_verror(td, err, "aio_waitcomplete");
+			break;
+		}
+
+		io_u = container_of(aiocb, struct io_u, aiocb);
+		pd->queued--;
+		pd->aio_events[events++] = io_u;
+
+		if (retval >= 0)
+			io_u->resid = io_u->xfer_buflen - retval;
+		else if (err == ECANCELED)
+			io_u->resid = io_u->xfer_buflen;
+		else
+			io_u->error = err;
+	} while (events < max && pd->queued > 0);
+
+	return events;
+}
+
+#endif
+
 #define SUSPEND_ENTRIES	8

-static int fio_posixaio_getevents(struct thread_data *td, unsigned int min,
-				  unsigned int max, const struct timespec *t)
+static int fio_posixaio_getevents_suspend(struct thread_data *td,
+					  unsigned int min,
+					  unsigned int max,
+					  const struct timespec *t)
 {
 	struct posixaio_data *pd = td->io_ops_data;
+	struct posixaio_options *o = td->eo;
 	os_aiocb_t *suspend_list[SUSPEND_ENTRIES];
 	struct timespec start;
 	int have_timeout = 0;
@@ -105,6 +224,9 @@ static int fio_posixaio_getevents(struct thread_data *td, unsigned int min,
 			io_u->resid = io_u->xfer_buflen - retval;
 		} else
 			io_u->error = err;
+
+		if (o->respect_iodepth_batch_complete_max && r >= max)
+			break;
 	}

 	if (r >= min)
@@ -126,6 +248,17 @@ static int fio_posixaio_getevents(struct thread_data *td, unsigned int min,
 	goto restart;
 }

+/* Reap events using the wait mechanism chosen in fio_posixaio_init(). */
+static int fio_posixaio_getevents(struct thread_data *td,
+				  unsigned int min,
+				  unsigned int max,
+				  const struct timespec *t)
+{
+	struct posixaio_data *pd = td->io_ops_data;
+
+	return pd->getevents(td, min, max, t);
+}
+
 static struct io_u *fio_posixaio_event(struct thread_data *td, int event)
 {
 	struct posixaio_data *pd = td->io_ops_data;
@@ -197,13 +330,30 @@ static void fio_posixaio_cleanup(struct thread_data *td)

 static int fio_posixaio_init(struct thread_data *td)
 {
+	struct posixaio_options *o = td->eo;
 	struct posixaio_data *pd = malloc(sizeof(*pd));

 	memset(pd, 0, sizeof(*pd));
 	pd->aio_events = malloc(td->o.iodepth * sizeof(struct io_u *));
 	memset(pd->aio_events, 0, td->o.iodepth * sizeof(struct io_u *));

+	switch (o->wait) {
+	case FIO_POSIXAIO_SUSPEND:
+		pd->getevents = fio_posixaio_getevents_suspend;
+		break;
+#ifdef CONFIG_HAVE_AIO_WAITCOMPLETE
+	case FIO_POSIXAIO_WAITCOMPLETE:
+		pd->getevents = fio_posixaio_getevents_waitcomplete;
+		break;
+#endif
+	default:
+		free(pd->aio_events);
+		free(pd);
+		return -1;
+	}
+
 	td->io_ops_data = pd;
+
 	return 0;
 }

@@ -221,6 +371,8 @@ static struct ioengine_ops ioengine = {
 	.open_file		= generic_open_file,
 	.close_file		= generic_close_file,
 	.get_file_size		= generic_get_file_size,
+	.options		= options,
+	.option_struct_size	= sizeof(struct posixaio_options),
 };

 static void fio_init fio_posixaio_register(void)
diff --git a/fio.1 b/fio.1
index a3ebb67d36..e182449a1f 100644
--- a/fio.1
+++ b/fio.1
@@ -2397,6 +2397,17 @@ Defines the time between the SIGTERM and SIGKILL signals. Default is 1 second.
 .TP
 .BI (exec)std_redirect\fR=\fbool
 If set, stdout and stderr streams are redirected to files named from the job name. Default is true.
+.TP
+.BI (posixaio)posixaio_respect_iodepth_batch_complete_max\fR=\fPbool
+If set, limit batch completions according to
+\fBiodepth_batch_complete_max\fR, as other engines do. Default is
+false, effectively setting
+\fBiodepth_batch_complete_max\fR to the same value as
+\fBiodepth\fR.
+.TP
+.BI (posixaio)posixaio_wait\fR=\fPstr
+Selects the mechanism used for waiting for I/Os to complete. Default is
+\fBaio_suspend\fR. On FreeBSD, \fBaio_waitcomplete\fR may be used.
.SS "I/O depth" .TP .BI iodepth \fR=\fPint diff --git a/optgroup.h b/optgroup.h index 1fb84a296b..af6bf81e1c 100644 --- a/optgroup.h +++ b/optgroup.h @@ -71,6 +71,7 @@ enum opt_category_group { __FIO_OPT_G_LIBCUFILE, __FIO_OPT_G_DFS, __FIO_OPT_G_NFS, + __FIO_OPT_G_POSIXAIO, FIO_OPT_G_RATE = (1ULL << __FIO_OPT_G_RATE), FIO_OPT_G_ZONE = (1ULL << __FIO_OPT_G_ZONE), @@ -116,6 +117,7 @@ enum opt_category_group { FIO_OPT_G_FILESTAT = (1ULL << __FIO_OPT_G_FILESTAT), FIO_OPT_G_LIBCUFILE = (1ULL << __FIO_OPT_G_LIBCUFILE), FIO_OPT_G_DFS = (1ULL << __FIO_OPT_G_DFS), + FIO_OPT_G_POSIXAIO = (1ULL << __FIO_OPT_G_POSIXAIO), }; extern const struct opt_group *opt_group_from_mask(uint64_t *mask);