- 
                Notifications
    You must be signed in to change notification settings 
- Fork 248
Use REP MOVSQ/STOSQ on x86_64 #365
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
          
     Merged
      
      
    
  
     Merged
                    Changes from all commits
      Commits
    
    
            Show all changes
          
          
            10 commits
          
        
        Select commit
          Hold shift + click to select a range
      
      c6c7621
              
                mem: Move mem* functions to separate directory
              
              
                josephlr 80b7c01
              
                memcpy: Create separate memcpy.rs file
              
              
                josephlr ee54782
              
                benches: Add benchmarks for mem* functions
              
              
                josephlr fb03d26
              
                mem: Add REP MOVSB/STOSB implementations
              
              
                josephlr 2a0132c
              
                mem: Add documentation for REP string instructions
              
              
                josephlr aa75260
              
                Use quad-word rep string instructions
              
              
                josephlr de4ed28
              
                Prevent panic when compiled in debug mode
              
              
                josephlr fe71a12
              
                Add tests for mem* functions
              
              
                josephlr aa326a3
              
                Add build/test with the "asm" feature
              
              
                josephlr d4a180a
              
                Add byte length to Bencher
              
              
                josephlr File filter
Filter by extension
Conversations
          Failed to load comments.   
        
        
          
      Loading
        
  Jump to
        
          Jump to file
        
      
      
          Failed to load files.   
        
        
          
      Loading
        
  Diff view
Diff view
There are no files selected for viewing
  
    
      This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
      Learn more about bidirectional Unicode characters
    
  
  
    
              
              
  
    
      This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
      Learn more about bidirectional Unicode characters
    
  
  
    
              | Original file line number | Diff line number | Diff line change | 
|---|---|---|
| @@ -0,0 +1,41 @@ | ||
| use super::c_int; | ||
|  | ||
/// Copies `n` bytes from `src` to `dest`, one byte at a time from the lowest
/// address upwards, and returns `dest`.
///
/// # Safety
///
/// `src` must be valid for reads of `n` bytes and `dest` must be valid for
/// writes of `n` bytes. Because the copy always runs front to back, an
/// overlapping region whose `dest` lies ahead of `src` would be overwritten
/// before it is read.
#[cfg_attr(all(feature = "mem", not(feature = "mangled-names")), no_mangle)]
pub unsafe extern "C" fn memcpy(dest: *mut u8, src: *const u8, n: usize) -> *mut u8 {
    // NOTE(review): kept as an explicit loop; a library copy routine could
    // presumably lower back to a `memcpy` call, i.e. to this very symbol.
    let mut i = 0;
    while i < n {
        // `add` takes the `usize` index directly, avoiding the `as isize` cast.
        *dest.add(i) = *src.add(i);
        i += 1;
    }
    dest
}
|  | ||
/// Copies `n` bytes from `src` to `dest`, choosing the copy direction so that
/// overlapping regions are handled, and returns `dest`.
///
/// # Safety
///
/// `src` must be valid for reads of `n` bytes and `dest` must be valid for
/// writes of `n` bytes.
#[cfg_attr(all(feature = "mem", not(feature = "mangled-names")), no_mangle)]
pub unsafe extern "C" fn memmove(dest: *mut u8, src: *const u8, n: usize) -> *mut u8 {
    if src < dest as *const u8 {
        // `dest` is ahead of `src`: copy from the end so that source bytes are
        // read before a later write can overwrite them.
        let mut i = n;
        while i != 0 {
            i -= 1;
            *dest.add(i) = *src.add(i);
        }
    } else {
        // `dest` is at or behind `src`: copy from the beginning.
        let mut i = 0;
        while i < n {
            *dest.add(i) = *src.add(i);
            i += 1;
        }
    }
    dest
}
|  | ||
/// Writes the low 8 bits of `c` to each of the first `n` bytes at `s` and
/// returns `s`.
///
/// # Safety
///
/// `s` must be valid for writes of `n` bytes.
#[cfg_attr(all(feature = "mem", not(feature = "mangled-names")), no_mangle)]
pub unsafe extern "C" fn memset(s: *mut u8, c: c_int, n: usize) -> *mut u8 {
    // Only the low byte of `c` is stored; convert once, outside the loop.
    let byte = c as u8;
    let mut i = 0;
    while i < n {
        *s.add(i) = byte;
        i += 1;
    }
    s
}
  
    
      This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
      Learn more about bidirectional Unicode characters
    
  
  
    
              
  
    
      This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
      Learn more about bidirectional Unicode characters
    
  
  
    
              | Original file line number | Diff line number | Diff line change | 
|---|---|---|
| @@ -0,0 +1,79 @@ | ||
| use super::c_int; | ||
|  | ||
| // On most modern Intel and AMD processors, "rep movsq" and "rep stosq" have | ||
| // been enhanced to perform better than a simple qword loop, making them ideal | ||
| // for implementing memcpy/memset. Note that "rep cmps" has received no such | ||
| // enhancement, so it is not used to implement memcmp. | ||
| // | ||
| // On certain recent Intel processors, "rep movsb" and "rep stosb" have been | ||
| // further enhanced to automatically select the best microarchitectural | ||
| // implementation based on length and alignment. See the following features from | ||
| // the "Intel® 64 and IA-32 Architectures Optimization Reference Manual": | ||
| // - ERMSB - Enhanced REP MOVSB and STOSB (Ivy Bridge and later) | ||
| // - FSRM - Fast Short REP MOV (Ice Lake and later) | ||
| // - Fast Zero-Length MOVSB (On no current hardware) | ||
| // - Fast Short STOSB (On no current hardware) | ||
| // However, to avoid run-time feature detection, we don't use these byte-based | ||
| // instructions for most of the copying, preferring the qword variants. | ||
|  | ||
/// Copies `count` bytes from `src` to `dest` with the x86_64 string
/// instructions and returns `dest`.
///
/// The bulk of the data is moved eight bytes at a time with `rep movsq`; the
/// remaining `count % 8` bytes are moved with `rep movsb`.
///
/// # Safety
///
/// `src` must be valid for reads of `count` bytes and `dest` must be valid for
/// writes of `count` bytes. The copy runs from low to high addresses.
#[cfg_attr(all(feature = "mem", not(feature = "mangled-names")), no_mangle)]
pub unsafe extern "C" fn memcpy(dest: *mut u8, src: *const u8, count: usize) -> *mut u8 {
    let qword_count = count >> 3; // number of whole 8-byte words
    let byte_count = count & 0b111; // leftover bytes (0..=7)
    // `asm!` is referenced by full path: it lives in `core::arch` and is not
    // in the prelude on current toolchains.
    core::arch::asm!(
        // rcx holds the qword count; rdi/rsi advance as the copy proceeds.
        "rep movsq [rdi], [rsi]",
        // Reload the counter with the tail length and finish byte-wise.
        "mov ecx, {byte_count:e}",
        "rep movsb [rdi], [rsi]",
        byte_count = in(reg) byte_count,
        // All three registers are modified by the instructions, so their
        // final values are discarded.
        inout("rcx") qword_count => _,
        inout("rdi") dest => _,
        inout("rsi") src => _,
        options(nostack, preserves_flags)
    );
    dest
}
|  | ||
| #[cfg_attr(all(feature = "mem", not(feature = "mangled-names")), no_mangle)] | ||
| pub unsafe extern "C" fn memmove(dest: *mut u8, src: *const u8, count: usize) -> *mut u8 { | ||
| let delta = (dest as usize).wrapping_sub(src as usize); | ||
| if delta >= count { | ||
| // We can copy forwards because either dest is far enough ahead of src, | ||
| // or src is ahead of dest (and delta overflowed). | ||
| return self::memcpy(dest, src, count); | ||
| } | ||
| // copy backwards | ||
| let qword_count = count >> 3; | ||
| let byte_count = count & 0b111; | ||
| asm!( | ||
| "std", | ||
| "rep movsq [rdi], [rsi]", | ||
| "mov ecx, {byte_count:e}", | ||
| "add rdi, 7", | ||
| "add rsi, 7", | ||
| "rep movsb [rdi], [rsi]", | ||
| "cld", | ||
| byte_count = in(reg) byte_count, | ||
| inout("rcx") qword_count => _, | ||
| inout("rdi") dest.offset(count as isize).wrapping_sub(8) => _, | ||
| inout("rsi") src.offset(count as isize).wrapping_sub(8) => _, | ||
| options(nostack) | ||
| ); | ||
| dest | ||
| } | ||
|  | ||
/// Fills the first `count` bytes at `dest` with the low 8 bits of `c` using
/// the x86_64 string-store instructions, and returns `dest`.
///
/// The bulk is written eight bytes at a time with `rep stosq`; the remaining
/// `count % 8` bytes are written with `rep stosb`.
///
/// # Safety
///
/// `dest` must be valid for writes of `count` bytes.
#[cfg_attr(all(feature = "mem", not(feature = "mangled-names")), no_mangle)]
pub unsafe extern "C" fn memset(dest: *mut u8, c: c_int, count: usize) -> *mut u8 {
    let qword_count = count >> 3; // number of whole 8-byte words
    let byte_count = count & 0b111; // leftover bytes (0..=7)
    // Replicate the fill byte into all eight bytes of a qword. The largest
    // product, 0xFF * 0x0101_0101_0101_0101, is exactly u64::MAX, so the
    // multiplication cannot overflow.
    let pattern = (c as u8 as u64) * 0x0101010101010101;
    // `asm!` is referenced by full path: it lives in `core::arch` and is not
    // in the prelude on current toolchains.
    core::arch::asm!(
        "rep stosq [rdi], rax",
        // Reload the counter with the tail length and finish byte-wise.
        "mov ecx, {byte_count:e}",
        "rep stosb [rdi], al",
        byte_count = in(reg) byte_count,
        inout("rcx") qword_count => _,
        inout("rdi") dest => _,
        in("rax") pattern,
        options(nostack, preserves_flags)
    );
    dest
}
  
    
      This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
      Learn more about bidirectional Unicode characters
    
  
  
    
              | Original file line number | Diff line number | Diff line change | 
|---|---|---|
| @@ -0,0 +1,162 @@ | ||
| #![feature(test)] | ||
|  | ||
| extern crate test; | ||
| use test::{black_box, Bencher}; | ||
|  | ||
| extern crate compiler_builtins; | ||
| use compiler_builtins::mem::{memcmp, memcpy, memmove, memset}; | ||
|  | ||
| fn memcpy_builtin(b: &mut Bencher, n: usize) { | ||
| let v1 = vec![1u8; n]; | ||
| let mut v2 = vec![0u8; n]; | ||
| b.bytes = n as u64; | ||
| b.iter(|| { | ||
|         
                  josephlr marked this conversation as resolved.
              Show resolved
            Hide resolved | ||
| let src: &[u8] = black_box(&v1); | ||
| let dst: &mut [u8] = black_box(&mut v2); | ||
| dst.copy_from_slice(src); | ||
| }) | ||
| } | ||
|  | ||
| fn memcpy_rust(b: &mut Bencher, n: usize) { | ||
| let v1 = vec![1u8; n]; | ||
| let mut v2 = vec![0u8; n]; | ||
| b.bytes = n as u64; | ||
| b.iter(|| { | ||
| let src: &[u8] = black_box(&v1); | ||
| let dst: &mut [u8] = black_box(&mut v2); | ||
| unsafe { memcpy(dst.as_mut_ptr(), src.as_ptr(), n) } | ||
| }) | ||
| } | ||
|  | ||
| fn memset_builtin(b: &mut Bencher, n: usize) { | ||
| let mut v1 = vec![0u8; n]; | ||
| b.bytes = n as u64; | ||
| b.iter(|| { | ||
| let dst: &mut [u8] = black_box(&mut v1); | ||
| let val: u8 = black_box(27); | ||
| for b in dst { | ||
| *b = val; | ||
| } | ||
| }) | ||
| } | ||
|  | ||
| fn memset_rust(b: &mut Bencher, n: usize) { | ||
| let mut v1 = vec![0u8; n]; | ||
| b.bytes = n as u64; | ||
| b.iter(|| { | ||
| let dst: &mut [u8] = black_box(&mut v1); | ||
| let val = black_box(27); | ||
| unsafe { memset(dst.as_mut_ptr(), val, n) } | ||
| }) | ||
| } | ||
|  | ||
| fn memcmp_builtin(b: &mut Bencher, n: usize) { | ||
| let v1 = vec![0u8; n]; | ||
| let mut v2 = vec![0u8; n]; | ||
| v2[n - 1] = 1; | ||
| b.bytes = n as u64; | ||
| b.iter(|| { | ||
| let s1: &[u8] = black_box(&v1); | ||
| let s2: &[u8] = black_box(&v2); | ||
| s1.cmp(s2) | ||
| }) | ||
| } | ||
|  | ||
| fn memcmp_rust(b: &mut Bencher, n: usize) { | ||
| let v1 = vec![0u8; n]; | ||
| let mut v2 = vec![0u8; n]; | ||
| v2[n - 1] = 1; | ||
| b.bytes = n as u64; | ||
| b.iter(|| { | ||
| let s1: &[u8] = black_box(&v1); | ||
| let s2: &[u8] = black_box(&v2); | ||
| unsafe { memcmp(s1.as_ptr(), s2.as_ptr(), n) } | ||
| }) | ||
| } | ||
|  | ||
| fn memmove_builtin(b: &mut Bencher, n: usize) { | ||
| let mut v = vec![0u8; n + n / 2]; | ||
| b.bytes = n as u64; | ||
| b.iter(|| { | ||
| let s: &mut [u8] = black_box(&mut v); | ||
| s.copy_within(0..n, n / 2); | ||
| }) | ||
| } | ||
|  | ||
| fn memmove_rust(b: &mut Bencher, n: usize) { | ||
| let mut v = vec![0u8; n + n / 2]; | ||
| b.bytes = n as u64; | ||
| b.iter(|| { | ||
| let dst: *mut u8 = black_box(&mut v[n / 2..]).as_mut_ptr(); | ||
| let src: *const u8 = black_box(&v).as_ptr(); | ||
| unsafe { memmove(dst, src, n) }; | ||
| }) | ||
| } | ||
|  | ||
| #[bench] | ||
| fn memcpy_builtin_4096(b: &mut Bencher) { | ||
| memcpy_builtin(b, 4096) | ||
| } | ||
| #[bench] | ||
| fn memcpy_rust_4096(b: &mut Bencher) { | ||
| memcpy_rust(b, 4096) | ||
| } | ||
| #[bench] | ||
| fn memcpy_builtin_1048576(b: &mut Bencher) { | ||
| memcpy_builtin(b, 1048576) | ||
| } | ||
| #[bench] | ||
| fn memcpy_rust_1048576(b: &mut Bencher) { | ||
| memcpy_rust(b, 1048576) | ||
| } | ||
|  | ||
| #[bench] | ||
| fn memset_builtin_4096(b: &mut Bencher) { | ||
| memset_builtin(b, 4096) | ||
| } | ||
| #[bench] | ||
| fn memset_rust_4096(b: &mut Bencher) { | ||
| memset_rust(b, 4096) | ||
| } | ||
| #[bench] | ||
| fn memset_builtin_1048576(b: &mut Bencher) { | ||
| memset_builtin(b, 1048576) | ||
| } | ||
| #[bench] | ||
| fn memset_rust_1048576(b: &mut Bencher) { | ||
| memset_rust(b, 1048576) | ||
| } | ||
|  | ||
| #[bench] | ||
| fn memcmp_builtin_4096(b: &mut Bencher) { | ||
| memcmp_builtin(b, 4096) | ||
| } | ||
| #[bench] | ||
| fn memcmp_rust_4096(b: &mut Bencher) { | ||
| memcmp_rust(b, 4096) | ||
| } | ||
| #[bench] | ||
| fn memcmp_builtin_1048576(b: &mut Bencher) { | ||
| memcmp_builtin(b, 1048576) | ||
| } | ||
| #[bench] | ||
| fn memcmp_rust_1048576(b: &mut Bencher) { | ||
| memcmp_rust(b, 1048576) | ||
| } | ||
|  | ||
| #[bench] | ||
| fn memmove_builtin_4096(b: &mut Bencher) { | ||
| memmove_builtin(b, 4096) | ||
| } | ||
| #[bench] | ||
| fn memmove_rust_4096(b: &mut Bencher) { | ||
| memmove_rust(b, 4096) | ||
| } | ||
| #[bench] | ||
| fn memmove_builtin_1048576(b: &mut Bencher) { | ||
| memmove_builtin(b, 1048576) | ||
| } | ||
| #[bench] | ||
| fn memmove_rust_1048576(b: &mut Bencher) { | ||
| memmove_rust(b, 1048576) | ||
| } | ||
      
      Oops, something went wrong.
        
    
  
  Add this suggestion to a batch that can be applied as a single commit.
  This suggestion is invalid because no changes were made to the code.
  Suggestions cannot be applied while the pull request is closed.
  Suggestions cannot be applied while viewing a subset of changes.
  Only one suggestion per line can be applied in a batch.
  Add this suggestion to a batch that can be applied as a single commit.
  Applying suggestions on deleted lines is not supported.
  You must change the existing code in this line in order to create a valid suggestion.
  Outdated suggestions cannot be applied.
  This suggestion has been applied or marked resolved.
  Suggestions cannot be applied from pending reviews.
  Suggestions cannot be applied on multi-line comments.
  Suggestions cannot be applied while the pull request is queued to merge.
  Suggestion cannot be applied right now. Please check back later.
  
    
  
    
Uh oh!
There was an error while loading. Please reload this page.