Skip to content

Commit 3f0a9db

Browse files
committed
vxdna: Add contiguous heap VA and error handling for QEMU vxdna
Add expandable device heap support in vxdna (QEMU virtio-accel backend) with contiguous virtual address management and robust error handling. Heap VA management: - Reserve a contiguous HEAP_MAX_SIZE (512MB) VA range with PROT_NONE on first DEV_HEAP BO creation. Use MAP_FIXED to place each expansion chunk at m_heap_base + m_heap_cur_offset, ensuring the guest kernel sees a contiguous UVA space matching its expectations. - Track heap state with m_heap_destroyed flag: once any DEV_HEAP BO is destroyed, the contiguous VA invariant is broken so all further DEV/DEV_HEAP allocations are rejected with -EINVAL. - Add bounds check: reject expansion if m_heap_cur_offset + size exceeds HEAP_MAX_SIZE with -ENOSPC. - Move m_heap_cur_offset update after add_bo() and response write to avoid permanently wasting VA budget on failed operations. Error handling: - vxdna_ccmd_error_wrap: Remove exception re-throwing after catch. Re-throwing caused QEMU to send 0x1200 error responses and skip fence signaling, hanging the guest indefinitely on sync_wait. Now errors are logged and written to the response buffer only. - write_err_rsp: Log and return instead of throwing when resp resource is missing, preventing exceptions from escaping the error wrapper. - vxdna_bo constructor: Add mmap_target parameter to support MAP_FIXED placement. Add get_type() accessor for BO type queries. Cleanup: - Clean up reserved VA range in vxdna_context destructor. - Update Doxygen for vxdna_ccmd_error_wrap to reflect non-re-throwing behavior. Signed-off-by: Wendy Liang <wendy.liang@amd.com>
1 parent 5be2b21 commit 3f0a9db

File tree

2 files changed

+86
-33
lines changed

2 files changed

+86
-33
lines changed

src/vxdna/src/vaccel_amdxdna.cpp

Lines changed: 53 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -71,7 +71,8 @@ vxdna_bo(int ctx_fd_in, const struct amdxdna_ccmd_create_bo_req *req)
7171
vxdna_dbg("Create bo: ctx_fd=%d, type=%d, size=%lu", m_ctx_fd, m_bo_type, m_size);
7272
ret = ioctl(m_ctx_fd, DRM_IOCTL_AMDXDNA_CREATE_BO, &args);
7373
if (ret)
74-
VACCEL_THROW_MSG(-errno, "Create bo failed ret %d", ret);
74+
VACCEL_THROW_MSG(-errno, "Create bo failed ret %d, errno %d, %s",
75+
ret, errno, strerror(errno));
7576

7677
m_bo_handle = args.handle;
7778
bo_info.handle = m_bo_handle;
@@ -90,7 +91,8 @@ vxdna_bo(int ctx_fd_in, const struct amdxdna_ccmd_create_bo_req *req)
9091

9192
vxdna_bo::
9293
vxdna_bo(const std::shared_ptr<vaccel_resource> &res, int ctx_fd_in,
93-
const struct amdxdna_ccmd_create_bo_req *req)
94+
const struct amdxdna_ccmd_create_bo_req *req,
95+
void *mmap_target)
9496
: m_opaque_handle(res->get_opaque_handle())
9597
{
9698
struct amdxdna_drm_get_bo_info bo_info = {};
@@ -171,18 +173,18 @@ vxdna_bo(const std::shared_ptr<vaccel_resource> &res, int ctx_fd_in,
171173

172174
vxdna_dbg("mmap is required for handle: res_id=%u, handle=%u, opaque_handle=%d, vaddr=%lx, xdna_addr=%lx",
173175
res->get_res_id(), m_bo_handle, res->get_opaque_handle(), m_vaddr, m_xdna_addr);
174-
// mmap is required for non-dev BOs
175176
uint64_t resv_vaddr = 0, resv_size = 0, va_to_map = 0;
176177
void *resv_va = nullptr;
177178
int flags = MAP_SHARED | MAP_LOCKED;
178-
if (req->map_align) {
179+
if (mmap_target) {
180+
va_to_map = reinterpret_cast<uint64_t>(mmap_target);
181+
flags |= MAP_FIXED;
182+
} else if (req->map_align) {
179183
resv_va = ::mmap(0, m_map_size + req->map_align, PROT_READ | PROT_WRITE,
180184
MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
181185
if (resv_va == MAP_FAILED) {
182-
// Clean up if we created the BO
183-
if (has_created_bo) {
186+
if (has_created_bo)
184187
close_gem_handle(m_ctx_fd, m_bo_handle);
185-
}
186188
VACCEL_THROW_MSG(-ENOMEM, "Reserve vaddr range failed, map_align=%zu", req->map_align);
187189
}
188190

@@ -198,22 +200,21 @@ vxdna_bo(const std::shared_ptr<vaccel_resource> &res, int ctx_fd_in,
198200
int saved_errno = errno;
199201
if (resv_va && resv_va != MAP_FAILED)
200202
::munmap(resv_va, resv_size);
201-
// Clean up if we created the BO
202-
if (has_created_bo) {
203+
if (has_created_bo)
203204
close_gem_handle(m_ctx_fd, m_bo_handle);
204-
}
205205
VACCEL_THROW_MSG(-saved_errno,
206206
"Map bo failed, errno %d, %s, to map startaddr 0x%lx, map_offset 0x%lx, map_size 0x%lx",
207207
saved_errno, strerror(saved_errno), va_to_map, m_map_offset, m_map_size);
208208
}
209209
m_vaddr = reinterpret_cast<uint64_t>(va);
210210

211-
if (req->map_align && m_vaddr > resv_vaddr)
211+
if (!mmap_target && req->map_align && m_vaddr > resv_vaddr)
212212
::munmap(resv_va, static_cast<size_t>(m_vaddr - resv_vaddr));
213-
if (resv_vaddr + resv_size > m_vaddr + m_map_size)
213+
if (!mmap_target && resv_vaddr + resv_size > m_vaddr + m_map_size)
214214
munmap(reinterpret_cast<void *>(m_vaddr + m_map_size),
215215
static_cast<size_t>(resv_vaddr + resv_size - m_vaddr - m_map_size));
216-
vxdna_dbg("Created BO with resource: type=%u, res_id=%u", req->bo_type, req->res_id);
216+
vxdna_dbg("Created BO with resource: type=%u, res_id=%u, vaddr=0x%lx",
217+
req->bo_type, req->res_id, m_vaddr);
217218
}
218219

219220
vxdna_bo::
@@ -420,12 +421,39 @@ void
420421
vxdna_context::
421422
create_bo(const struct amdxdna_ccmd_create_bo_req *req)
422423
{
424+
if (m_heap_destroyed &&
425+
(req->bo_type == AMDXDNA_BO_DEV || req->bo_type == AMDXDNA_BO_DEV_HEAP))
426+
VACCEL_THROW_MSG(-EINVAL, "Heap destroyed, cannot allocate type %u", req->bo_type);
427+
423428
std::shared_ptr<vxdna_bo> xdna_bo;
424429
if (req->bo_type != AMDXDNA_BO_DEV) {
425430
auto res = get_device().get_resource(req->res_id);
426431
if (!res)
427432
VACCEL_THROW_MSG(-EINVAL, "Res: %u not found", req->res_id);
428-
xdna_bo = std::make_shared<vxdna_bo>(res, get_fd(), req);
433+
434+
void *mmap_target = nullptr;
435+
if (req->bo_type == AMDXDNA_BO_DEV_HEAP) {
436+
if (!m_heap_base) {
437+
m_heap_base = ::mmap(0, HEAP_MAX_SIZE, PROT_NONE,
438+
MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
439+
if (m_heap_base == MAP_FAILED) {
440+
m_heap_base = nullptr;
441+
VACCEL_THROW_MSG(-ENOMEM,
442+
"Failed to reserve heap VA range (%zu bytes)",
443+
HEAP_MAX_SIZE);
444+
}
445+
vxdna_dbg("Reserved heap VA range: base=%p, size=0x%zx",
446+
m_heap_base, HEAP_MAX_SIZE);
447+
}
448+
if (m_heap_cur_offset + req->size > HEAP_MAX_SIZE)
449+
VACCEL_THROW_MSG(-ENOSPC,
450+
"Heap expansion exceeds max size (%zu + %lu > %zu)",
451+
m_heap_cur_offset, (unsigned long)req->size,
452+
HEAP_MAX_SIZE);
453+
mmap_target = static_cast<char *>(m_heap_base) + m_heap_cur_offset;
454+
}
455+
456+
xdna_bo = std::make_shared<vxdna_bo>(res, get_fd(), req, mmap_target);
429457
} else {
430458
xdna_bo = std::make_shared<vxdna_bo>(get_fd(), req);
431459
}
@@ -440,6 +468,9 @@ create_bo(const struct amdxdna_ccmd_create_bo_req *req)
440468
VACCEL_THROW_MSG(-EINVAL, "Resp resource not found for context %u", get_id());
441469
(void)resp_res->write(req->hdr.rsp_off, &rsp, sizeof(rsp));
442470
add_bo(std::move(xdna_bo));
471+
472+
if (req->bo_type == AMDXDNA_BO_DEV_HEAP)
473+
m_heap_cur_offset += req->size;
443474
vxdna_dbg("Created bo: handle=%u, xdna_addr=%lu", rsp.handle, rsp.xdna_addr);
444475
}
445476

@@ -454,6 +485,9 @@ void
454485
vxdna_context::
455486
remove_bo(uint32_t handle)
456487
{
488+
auto bo = m_bo_table.lookup(handle);
489+
if (bo && bo->get_type() == AMDXDNA_BO_DEV_HEAP)
490+
m_heap_destroyed = true;
457491
vxdna_dbg("Removing bo: handle=%u", handle);
458492
m_bo_table.erase(handle);
459493
}
@@ -674,8 +708,11 @@ vxdna_context::
674708
write_err_rsp(int err)
675709
{
676710
auto resp_res = get_resp_res();
677-
if (!resp_res)
678-
VACCEL_THROW_MSG(-EINVAL, "Resp resource not found for context %u", get_id());
711+
if (!resp_res) {
712+
vxdna_err("write_err_rsp: no resp resource for ctx %u, err %d dropped",
713+
get_id(), err);
714+
return;
715+
}
679716
struct amdxdna_ccmd_rsp rsp = {};
680717
rsp.ret = err;
681718
rsp.base.len = sizeof(rsp);

src/vxdna/src/vaccel_amdxdna.h

Lines changed: 33 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -16,13 +16,15 @@
1616
#include <stddef.h>
1717
#include <stdexcept>
1818
#include <memory>
19+
#include <sys/mman.h>
1920

2021
#include "drm_hw.h" // from xdna shim virtio
2122

2223
#include "drm_local/amdxdna_accel.h"
2324
#include "amdxdna_proto.h"
2425
#include "vaccel.h"
2526
#include "vaccel_internal.h"
27+
#include "../util/vxdna_debug.h"
2628

2729
/**
2830
* @brief AMDXDNA Buffer Object wrapper
@@ -50,7 +52,9 @@ class vxdna_bo {
5052
* @param req BO creation request with type, size, and alignment
5153
* @throws vaccel_error on DRM ioctl failure
5254
*/
53-
vxdna_bo(const std::shared_ptr<vaccel_resource> &res, int ctx_fd_in, const struct amdxdna_ccmd_create_bo_req *req);
55+
vxdna_bo(const std::shared_ptr<vaccel_resource> &res, int ctx_fd_in,
56+
const struct amdxdna_ccmd_create_bo_req *req,
57+
void *mmap_target = nullptr);
5458

5559
/**
5660
* @brief Construct device-local BO
@@ -100,6 +104,15 @@ class vxdna_bo {
100104
return m_bo_handle;
101105
}
102106

107+
/**
108+
* @brief Get the BO type
109+
* @return BO type (AMDXDNA_BO_DEV, AMDXDNA_BO_DEV_HEAP, etc.)
110+
*/
111+
uint32_t get_type() const noexcept
112+
{
113+
return m_bo_type;
114+
}
115+
103116
private:
104117
uint64_t m_size = 0; /**< Buffer size in bytes */
105118
uint64_t m_vaddr = 0; /**< Virtual address (after mmap) */
@@ -156,6 +169,9 @@ class vxdna_context : public vaccel_context<vxdna_context, vxdna> {
156169
*/
157170
~vxdna_context() {
158171
vxdna_dbg("Context destroying: ctx_id=%u, fd=%d", get_id(), get_fd());
172+
m_bo_table.clear();
173+
if (m_heap_base)
174+
::munmap(m_heap_base, HEAP_MAX_SIZE);
159175
close(get_fd());
160176
}
161177

@@ -493,6 +509,11 @@ class vxdna_context : public vaccel_context<vxdna_context, vxdna> {
493509
std::shared_ptr<vaccel_resource> m_resp_res;
494510
vaccel_map<uint32_t, std::shared_ptr<vxdna_bo>> m_bo_table;
495511
vaccel_map<uint32_t, std::shared_ptr<vxdna_hwctx>> m_hwctx_table;
512+
513+
static constexpr size_t HEAP_MAX_SIZE = 512UL << 20;
514+
void *m_heap_base = nullptr;
515+
size_t m_heap_cur_offset = 0;
516+
bool m_heap_destroyed = false;
496517
};
497518

498519
/**
@@ -711,41 +732,36 @@ int vxdna_device_process_ccmd(void *cookie, const void *cmd_buf, size_t buf_size
711732
* @brief Exception wrapper for ccmd handlers
712733
*
713734
* Wraps a ccmd handler function to catch exceptions and write
714-
* appropriate error responses to the context's response buffer
715-
* before re-throwing.
716-
*
717-
* Usage:
718-
* @code
719-
* vxdna_ccmd_error_wrap(ctx, [&]() {
720-
* ctx->create_bo(req);
721-
* });
722-
* @endcode
735+
* appropriate error responses to the context's response buffer.
736+
* Exceptions are NOT re-thrown so that QEMU receives a zero return
737+
* from vaccel_submit_ccmd and signals the fence normally. Re-throwing
738+
* would cause QEMU to send 0x1200 error responses which cascades
739+
* into resource lookup failures for subsequent commands.
723740
*
724741
* Error handling:
725-
* - vaccel_error: Writes e.code() to response, re-throws
726-
* - std::exception: Writes -EIO to response, re-throws
727-
* - Unknown: Writes -EIO to response, throws std::runtime_error
742+
* - vaccel_error: Logs error, writes e.code() to response
743+
* - std::exception: Logs error, writes -EIO to response
744+
* - Unknown: Logs error, writes -EIO to response
728745
*
729746
* @tparam ContextType Context type (must have write_err_rsp method)
730747
* @tparam F Callable type (lambda, function, etc.)
731748
* @param ctx Context to write error response to
732749
* @param f Handler function to execute
733-
* @throws Re-throws caught exceptions after writing response
734750
*/
735751
template<typename ContextType, typename F> void
736752
vxdna_ccmd_error_wrap(const std::shared_ptr<ContextType> &ctx, F &&f)
737753
{
738754
try {
739755
f();
740756
} catch (const vaccel_error& e) {
757+
vxdna_err("ccmd failed: %s", e.what());
741758
ctx->write_err_rsp(e.code());
742-
throw e;
743759
} catch (const std::exception& e) {
760+
vxdna_err("ccmd failed (unexpected): %s", e.what());
744761
ctx->write_err_rsp(-EIO);
745-
throw e;
746762
} catch (...) {
763+
vxdna_err("ccmd failed (unknown exception)");
747764
ctx->write_err_rsp(-EIO);
748-
throw std::runtime_error("Unknown exception");
749765
}
750766
}
751767

0 commit comments

Comments
 (0)