public inbox for gentoo-commits@lists.gentoo.org
 help / color / mirror / Atom feed
* [gentoo-commits] proj/portage:master commit in: lib/portage/util/, lib/portage/util/file_copy/, src/
@ 2024-03-15 20:06 Mike Gilbert
  0 siblings, 0 replies; only message in thread
From: Mike Gilbert @ 2024-03-15 20:06 UTC (permalink / raw)
  To: gentoo-commits

commit:     23529ee81964665107400e87fc3d49c256e915c0
Author:     Mike Gilbert <floppym <AT> gentoo <DOT> org>
AuthorDate: Fri Mar  1 15:45:58 2024 +0000
Commit:     Mike Gilbert <floppym <AT> gentoo <DOT> org>
CommitDate: Fri Mar 15 20:05:34 2024 +0000
URL:        https://gitweb.gentoo.org/proj/portage.git/commit/?id=23529ee8

Replace linux_reflink extension module

Python 3.8 added support for copy_file_range in the os module,
so we can just call that directly.

Also, we can use the FICLONE ioctl for fast file clones on supported
filesystems (btrfs).

Signed-off-by: Mike Gilbert <floppym <AT> gentoo.org>

 lib/portage/util/file_copy.py              | 137 ++++++++++
 lib/portage/util/file_copy/__init__.py     |  36 ---
 lib/portage/util/file_copy/meson.build     |   7 -
 lib/portage/util/meson.build               |   2 +-
 src/meson.build                            |  20 --
 src/portage_util_file_copy_reflink_linux.c | 396 -----------------------------
 6 files changed, 138 insertions(+), 460 deletions(-)

diff --git a/lib/portage/util/file_copy.py b/lib/portage/util/file_copy.py
new file mode 100644
index 0000000000..e3926d8ef6
--- /dev/null
+++ b/lib/portage/util/file_copy.py
@@ -0,0 +1,137 @@
+# Copyright 2024 Gentoo Authors
+# Distributed under the terms of the GNU General Public License v2
+
+import errno
+import fcntl
+import logging
+import os
+import platform
+import shutil
+import sys
+
+
+logger = logging.getLogger(__name__)
+
+# Added in Python 3.12
+FICLONE = getattr(fcntl, "FICLONE", 0x40049409)
+
+# Unavailable in PyPy
+SEEK_DATA = getattr(os, "SEEK_DATA", 3)
+SEEK_HOLE = getattr(os, "SEEK_HOLE", 4)
+
+
+def _get_chunks(src):
+    try:
+        offset_hole = 0
+        while True:
+            try:
+                # Find the next bit of data
+                offset_data = os.lseek(src, offset_hole, SEEK_DATA)
+            except OSError as e:
+                # Re-raise for unexpected errno values
+                if e.errno not in (errno.EINVAL, errno.ENXIO):
+                    raise
+
+                offset_end = os.lseek(src, 0, os.SEEK_END)
+
+                if e.errno == errno.ENXIO:
+                    # End of file
+                    if offset_end > offset_hole:
+                        # Hole at end of file
+                        yield (offset_end, 0)
+                else:
+                    # SEEK_DATA failed with EINVAL, return the whole file
+                    yield (0, offset_end)
+
+                break
+            else:
+                offset_hole = os.lseek(src, offset_data, SEEK_HOLE)
+                yield (offset_data, offset_hole - offset_data)
+
+    except OSError:
+        logger.warning("_get_chunks failed unexpectedly", exc_info=sys.exc_info())
+        raise
+
+
+def _do_copy_file_range(src, dst, offset, count):
+    while count > 0:
+        # count must fit in ssize_t
+        c = min(count, sys.maxsize)
+        written = os.copy_file_range(src, dst, c, offset, offset)
+        if written == 0:
+            # https://bugs.gentoo.org/828844
+            raise OSError(errno.EOPNOTSUPP, os.strerror(errno.EOPNOTSUPP))
+        offset += written
+        count -= written
+
+
+def _do_sendfile(src, dst, offset, count):
+    os.lseek(dst, offset, os.SEEK_SET)
+    while count > 0:
+        # count must fit in ssize_t
+        c = min(count, sys.maxsize)
+        written = os.sendfile(dst, src, offset, c)
+        offset += written
+        count -= written
+
+
+def _fastcopy(src, dst):
+    with (
+        open(src, "rb", buffering=0) as srcf,
+        open(dst, "wb", buffering=0) as dstf,
+    ):
+        srcfd = srcf.fileno()
+        dstfd = dstf.fileno()
+
+        if platform.system() == "Linux":
+            try:
+                fcntl.ioctl(dstfd, FICLONE, srcfd)
+                return
+            except OSError:
+                pass
+
+        try_cfr = hasattr(os, "copy_file_range")
+
+        for offset, count in _get_chunks(srcfd):
+            if count == 0:
+                os.ftruncate(dstfd, offset)
+            else:
+                if try_cfr:
+                    try:
+                        _do_copy_file_range(srcfd, dstfd, offset, count)
+                        continue
+                    except OSError as e:
+                        try_cfr = False
+                        if e.errno not in (errno.EXDEV, errno.ENOSYS, errno.EOPNOTSUPP):
+                            logger.warning(
+                                "_do_copy_file_range failed unexpectedly",
+                                exc_info=sys.exc_info(),
+                            )
+                try:
+                    _do_sendfile(srcfd, dstfd, offset, count)
+                except OSError:
+                    logger.warning(
+                        "_do_sendfile failed unexpectedly", exc_info=sys.exc_info()
+                    )
+                    raise
+
+
+def copyfile(src, dst):
+    """
+    Copy the contents (no metadata) of the file named src to a file
+    named dst.
+
+    If possible, copying is done within the kernel, and uses
+    "copy acceleration" techniques (such as reflinks). This also
+    supports sparse files.
+
+    @param src: path of source file
+    @type src: str
+    @param dst: path of destination file
+    @type dst: str
+    """
+
+    try:
+        _fastcopy(src, dst)
+    except OSError:
+        shutil.copyfile(src, dst)

diff --git a/lib/portage/util/file_copy/__init__.py b/lib/portage/util/file_copy/__init__.py
deleted file mode 100644
index f88d4d9d59..0000000000
--- a/lib/portage/util/file_copy/__init__.py
+++ /dev/null
@@ -1,36 +0,0 @@
-# Copyright 2017 Gentoo Foundation
-# Distributed under the terms of the GNU General Public License v2
-
-import shutil
-
-try:
-    from portage.util.file_copy.reflink_linux import file_copy as _file_copy
-except ImportError:
-    _file_copy = None
-
-
-def _optimized_copyfile(src, dst):
-    """
-    Copy the contents (no metadata) of the file named src to a file
-    named dst.
-
-    If possible, copying is done within the kernel, and uses
-    "copy acceleration" techniques (such as reflinks). This also
-    supports sparse files.
-
-    @param src: path of source file
-    @type src: str
-    @param dst: path of destination file
-    @type dst: str
-    """
-    with (
-        open(src, "rb", buffering=0) as src_file,
-        open(dst, "wb", buffering=0) as dst_file,
-    ):
-        _file_copy(src_file.fileno(), dst_file.fileno())
-
-
-if _file_copy is None:
-    copyfile = shutil.copyfile
-else:
-    copyfile = _optimized_copyfile

diff --git a/lib/portage/util/file_copy/meson.build b/lib/portage/util/file_copy/meson.build
deleted file mode 100644
index 3e1d98f333..0000000000
--- a/lib/portage/util/file_copy/meson.build
+++ /dev/null
@@ -1,7 +0,0 @@
-py.install_sources(
-    [
-        '__init__.py',
-    ],
-    subdir : 'portage/util/file_copy',
-    pure : not native_extensions
-)

diff --git a/lib/portage/util/meson.build b/lib/portage/util/meson.build
index b1e4a1c807..8a60617d6f 100644
--- a/lib/portage/util/meson.build
+++ b/lib/portage/util/meson.build
@@ -10,6 +10,7 @@ py.install_sources(
         'cpuinfo.py',
         'digraph.py',
         'env_update.py',
+        'file_copy.py',
         'formatter.py',
         'hooks.py',
         'install_mask.py',
@@ -41,7 +42,6 @@ py.install_sources(
 
 subdir('elf')
 subdir('endian')
-subdir('file_copy')
 subdir('futures')
 subdir('iterators')
 subdir('_async')

diff --git a/src/meson.build b/src/meson.build
index 6a36724ceb..0220e8d56b 100644
--- a/src/meson.build
+++ b/src/meson.build
@@ -19,23 +19,3 @@ run_command(
     capture : false,
     check : true
 )
-
-if host_machine.system() == 'linux'
-    reflink_ext = py.extension_module(
-        'reflink_linux',
-        'portage_util_file_copy_reflink_linux.c',
-        dependencies : py.dependency(),
-        subdir : 'portage' / 'util' / 'file_copy',
-        install : true
-    )
-
-    run_command(
-        [
-            'ln', '-srnf',
-            reflink_ext.full_path(),
-            meson.project_source_root() / 'lib' / 'portage' / 'util' / 'file_copy/'
-        ],
-        capture : false,
-        check : true
-    )
-endif

diff --git a/src/portage_util_file_copy_reflink_linux.c b/src/portage_util_file_copy_reflink_linux.c
deleted file mode 100644
index e98db3db88..0000000000
--- a/src/portage_util_file_copy_reflink_linux.c
+++ /dev/null
@@ -1,396 +0,0 @@
-/* Copyright 2017-2023 Gentoo Authors
- * Distributed under the terms of the GNU General Public License v2
- */
-
-#include <Python.h>
-#include <errno.h>
-#include <stdlib.h>
-#include <ctype.h>
-#include <sys/sendfile.h>
-#include <sys/stat.h>
-#include <sys/syscall.h>
-#include <sys/types.h>
-#include <unistd.h>
-
-static PyObject * _reflink_linux_file_copy(PyObject *, PyObject *);
-
-static PyMethodDef reflink_linuxMethods[] = {
-    {
-            .ml_name = "file_copy",
-            .ml_meth = _reflink_linux_file_copy,
-            .ml_flags = METH_VARARGS,
-            .ml_doc = "Copy between two file descriptors with reflink and sparse file support."
-    },
-    {NULL, NULL, 0, NULL}
-};
-
-static struct PyModuleDef moduledef = {
-    PyModuleDef_HEAD_INIT,
-    .m_name = "reflink_linux",
-    .m_doc = "Module for reflink_linux copy operations",
-    .m_size = -1,
-    .m_methods = reflink_linuxMethods,
-};
-
-PyMODINIT_FUNC
-PyInit_reflink_linux(void)
-{
-    PyObject *m;
-    m = PyModule_Create(&moduledef);
-    return m;
-}
-
-
-/**
- * cfr_wrapper - A copy_file_range syscall wrapper function, having a
- * function signature that is compatible with sf_wrapper.
- * @fd_out: output file descriptor
- * @fd_in: input file descriptor
- * @off_out: must point to a buffer that specifies the starting offset
- * where bytes will be copied to fd_out, and this buffer is adjusted by
- * the number of bytes copied.
- * @len: number of bytes to copy between the file descriptors
- *
- * Bytes are copied from fd_in starting from *off_out, and the file
- * offset of fd_in is not changed. Effects on the file offset of
- * fd_out are undefined.
- *
- * Return: Number of bytes written to out_fd on success, -1 on failure
- * (errno is set appropriately).
- */
-static ssize_t
-cfr_wrapper(int fd_out, int fd_in, off_t *off_out, size_t len)
-{
-#ifdef __NR_copy_file_range
-    off_t off_in = *off_out;
-    return syscall(__NR_copy_file_range, fd_in, &off_in, fd_out,
-                   off_out, len, 0);
-#else
-    /* This is how it fails at runtime when the syscall is not supported. */
-    errno = ENOSYS;
-    return -1;
-#endif
-}
-
-/**
- * sf_wrapper - A sendfile wrapper function, having a function signature
- * that is compatible with cfr_wrapper.
- * @fd_out: output file descriptor
- * @fd_in: input file descriptor
- * @off_out: must point to a buffer that specifies the starting offset
- * where bytes will be copied to fd_out, and this buffer is adjusted by
- * the number of bytes copied.
- * @len: number of bytes to copy between the file descriptors
- *
- * Bytes are copied from fd_in starting from *off_out, and the file
- * offset of fd_in is not changed. Effects on the file offset of
- * fd_out are undefined.
- *
- * Return: Number of bytes written to out_fd on success, -1 on failure
- * (errno is set appropriately).
- */
-static ssize_t
-sf_wrapper(int fd_out, int fd_in, off_t *off_out, size_t len)
-{
-    ssize_t ret;
-    off_t off_in = *off_out;
-    /* The sendfile docs do not specify behavior of the output file
-     * offset, therefore it must be adjusted with lseek.
-     */
-    if (lseek(fd_out, *off_out, SEEK_SET) < 0)
-        return -1;
-    ret = sendfile(fd_out, fd_in, &off_in, len);
-    if (ret > 0)
-        *off_out += ret;
-    return ret;
-}
-
-
-/**
- * do_lseek_data - Adjust file offsets to the next location containing
- * data, creating sparse empty blocks in the output file as needed.
- * @fd_in: input file descriptor
- * @fd_out: output file descriptor
- * @off_out: offset of the output file
- *
- * Use lseek SEEK_DATA to adjust off_out to the next location from fd_in
- * containing data (creates sparse empty blocks when appropriate). Effects
- * on file offsets are undefined.
- *
- * Return: On success, the number of bytes to copy before the next hole,
- * and -1 on failure (errno is set appropriately). Returns 0 when fd_in
- * reaches EOF.
- */
-static off_t
-do_lseek_data(int fd_out, int fd_in, off_t *off_out) {
-#ifdef SEEK_DATA
-    /* Use lseek SEEK_DATA/SEEK_HOLE for sparse file support,
-     * as suggested in the copy_file_range man page.
-     */
-    off_t offset_data, offset_hole;
-
-    offset_data = lseek(fd_in, *off_out, SEEK_DATA);
-    if (offset_data < 0) {
-        if (errno == ENXIO) {
-            /* EOF - If the file ends with a hole, then use lseek SEEK_END
-             * to find the end offset, and create sparse empty blocks in
-             * the output file. It's the caller's responsibility to
-             * truncate the file.
-             */
-            offset_hole = lseek(fd_in, 0, SEEK_END);
-            if (offset_hole < 0) {
-                return -1;
-            } else if (offset_hole != *off_out) {
-                if (lseek(fd_out, offset_hole, SEEK_SET) < 0) {
-                    return -1;
-                }
-                *off_out = offset_hole;
-            }
-            return 0;
-        }
-        return -1;
-    }
-
-    /* Create sparse empty blocks in the output file, up
-     * until the next location that will contain data.
-     */
-    if (offset_data != *off_out) {
-        if (lseek(fd_out, offset_data, SEEK_SET) < 0) {
-            return -1;
-        }
-        *off_out = offset_data;
-    }
-
-    /* Locate the next hole, so that we know when to
-     * stop copying. There is an implicit hole at the
-     * end of the file. This should never result in ENXIO
-     * after SEEK_DATA has succeeded above.
-     */
-    offset_hole = lseek(fd_in, offset_data, SEEK_HOLE);
-    if (offset_hole < 0) {
-        return -1;
-    }
-
-    return offset_hole - offset_data;
-#else
-    /* This is how it fails at runtime when lseek SEEK_DATA is not supported. */
-    errno = EINVAL;
-    return -1;
-#endif
-}
-
-
-/**
- * _reflink_linux_file_copy - Copy between two file descriptors, with
- * reflink and sparse file support.
- * @fd_in: input file descriptor
- * @fd_out: output file descriptor
- *
- * When supported, this uses copy_file_range for reflink support,
- * and lseek SEEK_DATA for sparse file support. It has graceful
- * fallbacks when support is unavailable for copy_file_range, lseek
- * SEEK_DATA, or sendfile operations. When all else fails, it uses
- * a plain read/write loop that works in any kernel version.
- *
- * If a syscall is interrupted by a signal, then the function will
- * automatically resume copying a the appropriate location which is
- * tracked internally by the offset_out variable.
- *
- * Return: The length of the output file on success. Raise OSError
- * on failure.
- */
-static PyObject *
-_reflink_linux_file_copy(PyObject *self, PyObject *args)
-{
-    int eintr_retry, error, fd_in, fd_out, stat_in_acquired, stat_out_acquired;
-    int lseek_works, sendfile_works;
-    off_t offset_out, len;
-    ssize_t buf_bytes, buf_offset, copyfunc_ret;
-    struct stat stat_in, stat_out;
-    char* buf;
-    ssize_t (*copyfunc)(int, int, off_t *, size_t);
-
-    if (!PyArg_ParseTuple(args, "ii", &fd_in, &fd_out))
-        return NULL;
-
-    eintr_retry = 1;
-    error = 0;
-    offset_out = 0;
-    stat_in_acquired = 0;
-    stat_out_acquired = 0;
-    buf = NULL;
-    buf_bytes = 0;
-    buf_offset = 0;
-    copyfunc = cfr_wrapper;
-    lseek_works = 1;
-    sendfile_works = 1;
-
-    while (eintr_retry) {
-
-        Py_BEGIN_ALLOW_THREADS
-
-        /* Linux 3.1 and later support SEEK_DATA (for sparse file support).
-         * This code uses copy_file_range if possible, and falls back to
-         * sendfile for cross-device or when the copy_file_range syscall
-         * is not available (less than Linux 4.5). This will fail for
-         * Linux less than 3.1, which does not support the lseek SEEK_DATA
-         * parameter.
-         */
-        if (sendfile_works && lseek_works) {
-            error = 0;
-
-            while (1) {
-                len = do_lseek_data(fd_out, fd_in, &offset_out);
-                if (!len) {
-                    /* EOF */
-                    break;
-                } else if (len < 0) {
-                    error = errno;
-                    if ((errno == EINVAL || errno == EOPNOTSUPP) && !offset_out) {
-                        lseek_works = 0;
-                    }
-                    break;
-                }
-
-                copyfunc_ret = copyfunc(fd_out,
-                                        fd_in,
-                                        &offset_out,
-                                        len);
-
-                if (copyfunc_ret <= 0) {
-                    error = errno;
-                    if ((errno == EXDEV || errno == ENOSYS || errno == EOPNOTSUPP || copyfunc_ret == 0) &&
-                        copyfunc == cfr_wrapper) {
-                        /* Use sendfile instead of copy_file_range for
-                         * cross-device copies, or when the copy_file_range
-                         * syscall is not available (less than Linux 4.5),
-                         * or when copy_file_range copies zero bytes.
-                         */
-                        error = 0;
-                        copyfunc = sf_wrapper;
-                        copyfunc_ret = copyfunc(fd_out,
-                                                fd_in,
-                                                &offset_out,
-                                                len);
-
-                        if (copyfunc_ret < 0) {
-                            error = errno;
-                            /* On Linux, if lseek succeeded above, then
-                             * sendfile should have worked here too, so
-                             * don't bother to fallback for EINVAL here.
-                             */
-                            break;
-                        }
-                    } else {
-                        break;
-                    }
-                }
-            }
-        }
-
-        /* Less than Linux 3.1 does not support SEEK_DATA or copy_file_range,
-         * so just use sendfile for in-kernel copy. This will fail for Linux
-         * versions from 2.6.0 to 2.6.32, because sendfile does not support
-         * writing to regular files.
-         */
-        if (sendfile_works && !lseek_works) {
-            error = 0;
-
-            if (!stat_in_acquired && fstat(fd_in, &stat_in) < 0) {
-                error = errno;
-            } else {
-                stat_in_acquired = 1;
-
-                while (offset_out < stat_in.st_size) {
-                    copyfunc_ret = sf_wrapper(fd_out,
-                                              fd_in,
-                                              &offset_out,
-                                              stat_in.st_size - offset_out);
-
-                    if (copyfunc_ret < 0) {
-                        error = errno;
-                        if (errno == EINVAL && !offset_out) {
-                            sendfile_works = 0;
-                        }
-                        break;
-                    }
-                }
-            }
-        }
-
-        /* This implementation will work on any kernel. */
-        if (!sendfile_works) {
-            error = 0;
-
-            if (!stat_out_acquired && fstat(fd_in, &stat_out) < 0) {
-                error = errno;
-            } else {
-                stat_out_acquired = 1;
-                if (buf == NULL)
-                    buf = malloc(stat_out.st_blksize);
-                if (buf == NULL) {
-                    error = errno;
-
-                /* For the read call, the fd_in file offset must be exactly
-                 * equal to offset_out + buf_bytes, where buf_bytes is the
-                 * amount of buffered data that has not been written to
-                 * to the output file yet. Use lseek to ensure correct state,
-                 * in case an EINTR retry caused it to get out of sync
-                 * somewhow.
-                 */
-                } else if (lseek(fd_in, offset_out + buf_bytes, SEEK_SET) < 0) {
-                    error = errno;
-                } else {
-                    while (1) {
-                        /* Some bytes may still be buffered from the
-                         * previous iteration of the outer loop.
-                         */
-                        if (!buf_bytes) {
-                            buf_offset = 0;
-                            buf_bytes = read(fd_in, buf, stat_out.st_blksize);
-
-                            if (!buf_bytes) {
-                                /* EOF */
-                                break;
-
-                            } else if (buf_bytes < 0) {
-                                error = errno;
-                                buf_bytes = 0;
-                                break;
-                            }
-                        }
-
-                        copyfunc_ret = write(fd_out,
-                                             buf + buf_offset,
-                                             buf_bytes);
-
-                        if (copyfunc_ret < 0) {
-                            error = errno;
-                            break;
-                        }
-
-                        buf_bytes -= copyfunc_ret;
-                        buf_offset += copyfunc_ret;
-                        offset_out += copyfunc_ret;
-                    }
-                }
-            }
-        }
-
-        if (!error && ftruncate(fd_out, offset_out) < 0)
-            error = errno;
-
-        Py_END_ALLOW_THREADS
-
-        if (!(error == EINTR && PyErr_CheckSignals() == 0))
-            eintr_retry = 0;
-    }
-
-    if (buf != NULL)
-        free(buf);
-
-    if (error)
-        return PyErr_SetFromErrno(PyExc_OSError);
-
-    return Py_BuildValue("i", offset_out);
-}


^ permalink raw reply related	[flat|nested] only message in thread

only message in thread, other threads:[~2024-03-15 20:06 UTC | newest]

Thread overview: (only message) (download: mbox.gz follow: Atom feed)
-- links below jump to the message on this page --
2024-03-15 20:06 [gentoo-commits] proj/portage:master commit in: lib/portage/util/, lib/portage/util/file_copy/, src/ Mike Gilbert

This is a public inbox; see the mirroring instructions
for how to clone and mirror all data and code used for this inbox.