diff --git a/0001-add-PyASCIIObject-state-bitfield-access-wrappers.patch b/0001-add-PyASCIIObject-state-bitfield-access-wrappers.patch
new file mode 100644
index 0000000..fd8b4ef
--- /dev/null
+++ b/0001-add-PyASCIIObject-state-bitfield-access-wrappers.patch
@@ -0,0 +1,393 @@
+diff --git a/src/cpython/unicodeobject.rs b/src/cpython/unicodeobject.rs
+index 9ec21bf6989..27b55ef8309 100644
+--- a/src/cpython/unicodeobject.rs
++++ b/src/cpython/unicodeobject.rs
+@@ -30,6 +30,184 @@ use std::os::raw::{c_char, c_int, c_uint, c_void};
+ // skipped Py_UNICODE_HIGH_SURROGATE
+ // skipped Py_UNICODE_LOW_SURROGATE
+ 
++// generated by bindgen v0.63.0 (with small adaptations)
++#[repr(C)]
++struct BitfieldUnit<Storage> {
++    storage: Storage,
++}
++
++impl<Storage> BitfieldUnit<Storage> {
++    #[inline]
++    pub const fn new(storage: Storage) -> Self {
++        Self { storage }
++    }
++}
++
++impl<Storage> BitfieldUnit<Storage>
++where
++    Storage: AsRef<[u8]> + AsMut<[u8]>,
++{
++    #[inline]
++    fn get_bit(&self, index: usize) -> bool {
++        debug_assert!(index / 8 < self.storage.as_ref().len());
++        let byte_index = index / 8;
++        let byte = self.storage.as_ref()[byte_index];
++        let bit_index = if cfg!(target_endian = "big") {
++            7 - (index % 8)
++        } else {
++            index % 8
++        };
++        let mask = 1 << bit_index;
++        byte & mask == mask
++    }
++
++    #[inline]
++    fn set_bit(&mut self, index: usize, val: bool) {
++        debug_assert!(index / 8 < self.storage.as_ref().len());
++        let byte_index = index / 8;
++        let byte = &mut self.storage.as_mut()[byte_index];
++        let bit_index = if cfg!(target_endian = "big") {
++            7 - (index % 8)
++        } else {
++            index % 8
++        };
++        let mask = 1 << bit_index;
++        if val {
++            *byte |= mask;
++        } else {
++            *byte &= !mask;
++        }
++    }
++
++    #[inline]
++    fn get(&self, bit_offset: usize, bit_width: u8) -> u64 {
++        debug_assert!(bit_width <= 64);
++        debug_assert!(bit_offset / 8 < self.storage.as_ref().len());
++        debug_assert!((bit_offset + (bit_width as usize)) / 8 <= self.storage.as_ref().len());
++        let mut val = 0;
++        for i in 0..(bit_width as usize) {
++            if self.get_bit(i + bit_offset) {
++                let index = if cfg!(target_endian = "big") {
++                    bit_width as usize - 1 - i
++                } else {
++                    i
++                };
++                val |= 1 << index;
++            }
++        }
++        val
++    }
++
++    #[inline]
++    fn set(&mut self, bit_offset: usize, bit_width: u8, val: u64) {
++        debug_assert!(bit_width <= 64);
++        debug_assert!(bit_offset / 8 < self.storage.as_ref().len());
++        debug_assert!((bit_offset + (bit_width as usize)) / 8 <= self.storage.as_ref().len());
++        for i in 0..(bit_width as usize) {
++            let mask = 1 << i;
++            let val_bit_is_set = val & mask == mask;
++            let index = if cfg!(target_endian = "big") {
++                bit_width as usize - 1 - i
++            } else {
++                i
++            };
++            self.set_bit(index + bit_offset, val_bit_is_set);
++        }
++    }
++}
++
++// generated by bindgen v0.63.0 (with small adaptations)
++// The same code is generated for Python 3.7, 3.8, 3.9, 3.10, and 3.11, but the "ready" field
++// has been removed from Python 3.12.
++
++/// Wrapper around the `PyASCIIObject.state` bitfield with getters and setters that work
++/// on most little- and big-endian architectures.
++///
++/// Memory layout of C bitfields is implementation defined, so these functions are still
++/// unsafe. Users must verify that they work as expected on the architectures they target.
++#[repr(C)]
++#[repr(align(4))]
++struct PyASCIIObjectState {
++    _bitfield_align: [u8; 0],
++    _bitfield: BitfieldUnit<[u8; 4usize]>,
++}
++
++// c_uint and u32 are not necessarily the same type on all targets / architectures
++#[allow(clippy::useless_transmute)]
++impl PyASCIIObjectState {
++    #[inline]
++    unsafe fn interned(&self) -> c_uint {
++        std::mem::transmute(self._bitfield.get(0usize, 2u8) as u32)
++    }
++
++    #[inline]
++    unsafe fn set_interned(&mut self, val: c_uint) {
++        let val: u32 = std::mem::transmute(val);
++        self._bitfield.set(0usize, 2u8, val as u64)
++    }
++
++    #[inline]
++    unsafe fn kind(&self) -> c_uint {
++        std::mem::transmute(self._bitfield.get(2usize, 3u8) as u32)
++    }
++
++    #[inline]
++    unsafe fn set_kind(&mut self, val: c_uint) {
++        let val: u32 = std::mem::transmute(val);
++        self._bitfield.set(2usize, 3u8, val as u64)
++    }
++
++    #[inline]
++    unsafe fn compact(&self) -> c_uint {
++        std::mem::transmute(self._bitfield.get(5usize, 1u8) as u32)
++    }
++
++    #[inline]
++    unsafe fn set_compact(&mut self, val: c_uint) {
++        let val: u32 = std::mem::transmute(val);
++        self._bitfield.set(5usize, 1u8, val as u64)
++    }
++
++    #[inline]
++    unsafe fn ascii(&self) -> c_uint {
++        std::mem::transmute(self._bitfield.get(6usize, 1u8) as u32)
++    }
++
++    #[inline]
++    unsafe fn set_ascii(&mut self, val: c_uint) {
++        let val: u32 = std::mem::transmute(val);
++        self._bitfield.set(6usize, 1u8, val as u64)
++    }
++
++    #[inline]
++    unsafe fn ready(&self) -> c_uint {
++        std::mem::transmute(self._bitfield.get(7usize, 1u8) as u32)
++    }
++
++    #[inline]
++    unsafe fn set_ready(&mut self, val: c_uint) {
++        let val: u32 = std::mem::transmute(val);
++        self._bitfield.set(7usize, 1u8, val as u64)
++    }
++}
++
++impl From<u32> for PyASCIIObjectState {
++    #[inline]
++    fn from(value: u32) -> Self {
++        PyASCIIObjectState {
++            _bitfield_align: [],
++            _bitfield: BitfieldUnit::new(value.to_ne_bytes()),
++        }
++    }
++}
++
++impl From<PyASCIIObjectState> for u32 {
++    #[inline]
++    fn from(value: PyASCIIObjectState) -> Self {
++        u32::from_ne_bytes(value._bitfield.storage)
++    }
++}
++
+ #[repr(C)]
+ pub struct PyASCIIObject {
+     pub ob_base: PyObject,
+@@ -52,34 +230,98 @@ pub struct PyASCIIObject {
+ }
+ 
+ /// Interacting with the bitfield is not actually well-defined, so we mark these APIs unsafe.
+-///
+-/// In addition, they are disabled on big-endian architectures to restrict this to most "common"
+-/// platforms, which are at least tested on CI and appear to be sound.
+-#[cfg(target_endian = "little")]
+ impl PyASCIIObject {
++    /// Get the `interned` field of the [`PyASCIIObject`] state bitfield.
++    ///
++    /// Returns one of: [`SSTATE_NOT_INTERNED`], [`SSTATE_INTERNED_MORTAL`], [`SSTATE_INTERNED_IMMORTAL`]
+     #[inline]
+     pub unsafe fn interned(&self) -> c_uint {
+-        self.state & 3
++        PyASCIIObjectState::from(self.state).interned()
+     }
+ 
++    /// Set the `interned` field of the [`PyASCIIObject`] state bitfield.
++    ///
++    /// Calling this function with an argument that is not [`SSTATE_NOT_INTERNED`],
++    /// [`SSTATE_INTERNED_MORTAL`], or [`SSTATE_INTERNED_IMMORTAL`] is invalid.
++    #[inline]
++    pub unsafe fn set_interned(&mut self, val: c_uint) {
++        let mut state = PyASCIIObjectState::from(self.state);
++        state.set_interned(val);
++        self.state = u32::from(state);
++    }
++
++    /// Get the `kind` field of the [`PyASCIIObject`] state bitfield.
++    ///
++    /// Returns one of: [`PyUnicode_WCHAR_KIND`], [`PyUnicode_1BYTE_KIND`], [`PyUnicode_2BYTE_KIND`],
++    /// [`PyUnicode_4BYTE_KIND`]
+     #[inline]
+     pub unsafe fn kind(&self) -> c_uint {
+-        (self.state >> 2) & 7
++        PyASCIIObjectState::from(self.state).kind()
+     }
+ 
++    /// Set the `kind` field of the [`PyASCIIObject`] state bitfield.
++    ///
++    /// Calling this function with an argument that is not [`PyUnicode_WCHAR_KIND`], [`PyUnicode_1BYTE_KIND`],
++    /// [`PyUnicode_2BYTE_KIND`], or [`PyUnicode_4BYTE_KIND`] is invalid.
++    #[inline]
++    pub unsafe fn set_kind(&mut self, val: c_uint) {
++        let mut state = PyASCIIObjectState::from(self.state);
++        state.set_kind(val);
++        self.state = u32::from(state);
++    }
++
++    /// Get the `compact` field of the [`PyASCIIObject`] state bitfield.
++    ///
++    /// Returns either `0` or `1`.
+     #[inline]
+     pub unsafe fn compact(&self) -> c_uint {
+-        (self.state >> 5) & 1
++        PyASCIIObjectState::from(self.state).compact()
++    }
++
++    /// Set the `compact` flag of the [`PyASCIIObject`] state bitfield.
++    ///
++    /// Calling this function with an argument that is neither `0` nor `1` is invalid.
++    #[inline]
++    pub unsafe fn set_compact(&mut self, val: c_uint) {
++        let mut state = PyASCIIObjectState::from(self.state);
++        state.set_compact(val);
++        self.state = u32::from(state);
+     }
+ 
++    /// Get the `ascii` field of the [`PyASCIIObject`] state bitfield.
++    ///
++    /// Returns either `0` or `1`.
+     #[inline]
+     pub unsafe fn ascii(&self) -> c_uint {
+-        (self.state >> 6) & 1
++        PyASCIIObjectState::from(self.state).ascii()
+     }
+ 
++    /// Set the `ascii` flag of the [`PyASCIIObject`] state bitfield.
++    ///
++    /// Calling this function with an argument that is neither `0` nor `1` is invalid.
++    #[inline]
++    pub unsafe fn set_ascii(&mut self, val: c_uint) {
++        let mut state = PyASCIIObjectState::from(self.state);
++        state.set_ascii(val);
++        self.state = u32::from(state);
++    }
++
++    /// Get the `ready` field of the [`PyASCIIObject`] state bitfield.
++    ///
++    /// Returns either `0` or `1`.
+     #[inline]
+     pub unsafe fn ready(&self) -> c_uint {
+-        (self.state >> 7) & 1
++        PyASCIIObjectState::from(self.state).ready()
++    }
++
++    /// Set the `ready` flag of the [`PyASCIIObject`] state bitfield.
++    ///
++    /// Calling this function with an argument that is neither `0` nor `1` is invalid.
++    #[inline]
++    pub unsafe fn set_ready(&mut self, val: c_uint) {
++        let mut state = PyASCIIObjectState::from(self.state);
++        state.set_ready(val);
++        self.state = u32::from(state);
+     }
+ }
+ 
+@@ -120,7 +362,6 @@ pub const SSTATE_INTERNED_MORTAL: c_uint = 1;
+ pub const SSTATE_INTERNED_IMMORTAL: c_uint = 2;
+ 
+ #[inline]
+-#[cfg(target_endian = "little")]
+ pub unsafe fn PyUnicode_IS_ASCII(op: *mut PyObject) -> c_uint {
+     debug_assert!(crate::PyUnicode_Check(op) != 0);
+     debug_assert!(PyUnicode_IS_READY(op) != 0);
+@@ -129,13 +370,11 @@ pub unsafe fn PyUnicode_IS_ASCII(op: *mut PyObject) -> c_uint {
+ }
+ 
+ #[inline]
+-#[cfg(target_endian = "little")]
+ pub unsafe fn PyUnicode_IS_COMPACT(op: *mut PyObject) -> c_uint {
+     (*(op as *mut PyASCIIObject)).compact()
+ }
+ 
+ #[inline]
+-#[cfg(target_endian = "little")]
+ pub unsafe fn PyUnicode_IS_COMPACT_ASCII(op: *mut PyObject) -> c_uint {
+     ((*(op as *mut PyASCIIObject)).ascii() != 0 && PyUnicode_IS_COMPACT(op) != 0).into()
+ }
+@@ -149,25 +388,21 @@ pub const PyUnicode_2BYTE_KIND: c_uint = 2;
+ pub const PyUnicode_4BYTE_KIND: c_uint = 4;
+ 
+ #[inline]
+-#[cfg(target_endian = "little")]
+ pub unsafe fn PyUnicode_1BYTE_DATA(op: *mut PyObject) -> *mut Py_UCS1 {
+     PyUnicode_DATA(op) as *mut Py_UCS1
+ }
+ 
+ #[inline]
+-#[cfg(target_endian = "little")]
+ pub unsafe fn PyUnicode_2BYTE_DATA(op: *mut PyObject) -> *mut Py_UCS2 {
+     PyUnicode_DATA(op) as *mut Py_UCS2
+ }
+ 
+ #[inline]
+-#[cfg(target_endian = "little")]
+ pub unsafe fn PyUnicode_4BYTE_DATA(op: *mut PyObject) -> *mut Py_UCS4 {
+     PyUnicode_DATA(op) as *mut Py_UCS4
+ }
+ 
+ #[inline]
+-#[cfg(target_endian = "little")]
+ pub unsafe fn PyUnicode_KIND(op: *mut PyObject) -> c_uint {
+     debug_assert!(crate::PyUnicode_Check(op) != 0);
+     debug_assert!(PyUnicode_IS_READY(op) != 0);
+@@ -176,7 +411,6 @@ pub unsafe fn PyUnicode_KIND(op: *mut PyObject) -> c_uint {
+ }
+ 
+ #[inline]
+-#[cfg(target_endian = "little")]
+ pub unsafe fn _PyUnicode_COMPACT_DATA(op: *mut PyObject) -> *mut c_void {
+     if PyUnicode_IS_ASCII(op) != 0 {
+         (op as *mut PyASCIIObject).offset(1) as *mut c_void
+@@ -186,7 +420,6 @@ pub unsafe fn _PyUnicode_COMPACT_DATA(op: *mut PyObject) -> *mut c_void {
+ }
+ 
+ #[inline]
+-#[cfg(target_endian = "little")]
+ pub unsafe fn _PyUnicode_NONCOMPACT_DATA(op: *mut PyObject) -> *mut c_void {
+     debug_assert!(!(*(op as *mut PyUnicodeObject)).data.any.is_null());
+ 
+@@ -194,7 +427,6 @@ pub unsafe fn _PyUnicode_NONCOMPACT_DATA(op: *mut PyObject) -> *mut c_void {
+ }
+ 
+ #[inline]
+-#[cfg(target_endian = "little")]
+ pub unsafe fn PyUnicode_DATA(op: *mut PyObject) -> *mut c_void {
+     debug_assert!(crate::PyUnicode_Check(op) != 0);
+ 
+@@ -210,7 +442,6 @@ pub unsafe fn PyUnicode_DATA(op: *mut PyObject) -> *mut c_void {
+ // skipped PyUnicode_READ_CHAR
+ 
+ #[inline]
+-#[cfg(target_endian = "little")]
+ pub unsafe fn PyUnicode_GET_LENGTH(op: *mut PyObject) -> Py_ssize_t {
+     debug_assert!(crate::PyUnicode_Check(op) != 0);
+     debug_assert!(PyUnicode_IS_READY(op) != 0);
+@@ -219,7 +450,6 @@ pub unsafe fn PyUnicode_GET_LENGTH(op: *mut PyObject) -> Py_ssize_t {
+ }
+ 
+ #[inline]
+-#[cfg(target_endian = "little")]
+ pub unsafe fn PyUnicode_IS_READY(op: *mut PyObject) -> c_uint {
+     (*(op as *mut PyASCIIObject)).ready()
+ }
+@@ -227,7 +457,6 @@ pub unsafe fn PyUnicode_IS_READY(op: *mut PyObject) -> c_uint {
+ #[cfg(not(Py_3_12))]
+ #[cfg_attr(Py_3_10, deprecated(note = "Python 3.10"))]
+ #[inline]
+-#[cfg(target_endian = "little")]
+ pub unsafe fn PyUnicode_READY(op: *mut PyObject) -> c_int {
+     debug_assert!(crate::PyUnicode_Check(op) != 0);
+ 
diff --git a/rust-pyo3-ffi.spec b/rust-pyo3-ffi.spec
index 7518850..8fdad32 100644
--- a/rust-pyo3-ffi.spec
+++ b/rust-pyo3-ffi.spec
@@ -15,6 +15,9 @@ Source: %{crates_source}
 # Manually created patch for downstream crate metadata changes
 # * drop MSVC- and MinGW-only features
 Patch: pyo3-ffi-fix-metadata.diff
+# * backport upstreamed patch to make PyASCIIObject available on big-endian arches
+# https://github.com/PyO3/pyo3/commit/40d6d47
+Patch: 0001-add-PyASCIIObject-state-bitfield-access-wrappers.patch
 
 BuildRequires: rust-packaging >= 21
 