% PMAlloc journal article (ACM TOCS, vol. 42, no. 3--4, article 7).
% Key is the DOI as exported by the publisher; keep it stable for existing \cite commands.
@article{10.1145/3643886,
  author     = {Dang, Zheng and He, Shuibing and Zhang, Xuechen and Hong, Peiyi and Li, Zhenxin and Chen, Xinyu and Song, Haozhe and Sun, Xian-He and Chen, Gang},
  title      = {{PMAlloc}: A Holistic Approach to Improving Persistent Memory Allocation},
  journal    = {ACM Transactions on Computer Systems},
  volume     = {42},
  number     = {3--4},
  articleno  = {7},
  numpages   = {52},
  year       = {2024},
  month      = sep,
  issue_date = {November 2024},
  publisher  = {Association for Computing Machinery},
  address    = {New York, NY, USA},
  issn       = {0734-2071},
  doi        = {10.1145/3643886},
  url        = {https://doi.org/10.1145/3643886},
  keywords   = {Dynamic memory allocation, persistent memory, memory fragmentation, non-uniform memory access},
  abstract   = {Persistent memory allocation is a fundamental building block for developing high-performance and in-memory applications. Existing persistent memory allocators suffer from many performance issues. First, they may introduce repeated cache line flushes and small random accesses in persistent memory for their poor heap metadata management. Second, they use static slab segregation resulting in a dramatic increase in memory consumption when allocation request size is changed. Third, they are not aware of NUMA effect, leading to remote persistent memory accesses in memory allocation and deallocation processes. In this article, we design a novel allocator, named PMAlloc, to solve the above issues simultaneously. (1) PMAlloc eliminates cache line reflushes by mapping contiguous data blocks in slabs to interleaved metadata entries stored in different cache lines. (2) It writes small metadata units to a persistent bookkeeping log in a sequential pattern to remove random heap metadata accesses in persistent memory. (3) Instead of using static slab segregation, it supports slab morphing, which allows slabs to be transformed between size classes to significantly improve slab usage. (4) It uses a local-first allocation policy to avoid allocating remote memory blocks. And it supports a two-phase deallocation mechanism including recording and synchronization to minimize the number of remote memory access in the deallocation.
PMAlloc is complementary to the existing consistency models. Results on six benchmarks demonstrate that PMAlloc improves the performance of state-of-the-art persistent memory allocators by up to 6.4\texttimes{} and 57\texttimes{} for small and large allocations, respectively. PMAlloc with NUMA optimizations brings a 2.9\texttimes{} speedup in multi-socket evaluation and is up to 36\texttimes{} faster than other persistent memory allocators. Using PMAlloc reduces memory usage by up to 57.8\%. Besides, we integrate PMAlloc in a persistent FPTree. Compared to the state-of-the-art allocators, PMAlloc improves the performance of this application by up to 3.1\texttimes{}.},
}