multithreading - How to get near optimal parallel efficiency for this simple Julia code?
问题描述
I have the following simple code:
function hamming4(bits1::Integer, bits2::Integer)
return count_ones(bits1 ⊻ bits2)
end
function random_strings2(n, N)
mask = UInt128(1) << n - 1
return [rand(UInt128) & mask for i in 1:N]
end
function find_min(strings, n, N)
minsofar = fill(n, Threads.nthreads())
# minsofar = n
Threads.@threads for i in 1:N
# for i in 1:N
for j in i+1:N
dist = hamming4(strings[i], strings[j])
if dist < minsofar[Threads.threadid()]
minsofar[Threads.threadid()] = dist
end
end
end
return minimum(minsofar)
#return minsofar
end
function ave_min(n, N)
ITER = 10
strings = random_strings2(n, N)
new_min = find_min(strings, n, N)
avesofar = new_min
# print("New min ", new_min, ". New ave ", avesofar, "\n")
total = avesofar
for i in 1:ITER-1
strings = random_strings2(n, N)
new_min = find_min(strings, n, N)
avesofar = avesofar*(i/(i+1)) + new_min/(i+1)
print("Iteration ", i, ". New min ", new_min, ". New ave ", round(avesofar; digits=2), "\n")
end
return avesofar
end
N = 2^16
n = 99
print("Overall average ", ave_min(n, N), "\n")
When I run it on an AMD 8350 in linux the CPU usage is around 430% (instead of close to 800%).
Is it possible to make the parallelisation work more efficiently?
Also, I noticed a new very impressive looking package called LoopVectorization.jl. As I am computing the Hamming distance in what looks like a vectorizable way, is it possible to speed up the code this way too?
Can the code be vectorized using LoopVectorization.jl?
(I am completely new to Julia)
解决方案
您的代码的并行化似乎是正确的。
您很可能在 Atom 或其他 IDE 中运行它。默认情况下,Atom 仅使用一半 o 核心(更准确地说,仅使用物理而非逻辑核心)。
例如在我的机器上在 Atom 中运行:
julia> Threads.nthreads()
4
您需要做的是显式设置JULIA_NUM_THREADS
Windows 命令行(仍然假设 8 个逻辑核心)
set JULIA_NUM_THREADS=8
Linux 命令行
export JULIA_NUM_THREADS=8
之后,您的代码将 100% 占用我所有的内核。
编辑
Distributed
经过讨论 - 您可以通过使用而不是将时间减少到单线程时间的 20% 左右,Threads
因为这避免了内存共享:
代码或多或少看起来像这样:
using Distributed
addprocs(8)
@everywhere function hamming4(bits1::Integer, bits2::Integer)
return count_ones(bits1 ⊻ bits2)
end
function random_strings2(n, N)
mask = UInt128(1) << n - 1
return [rand(UInt128) & mask for i in 1:N]
end
function find_min(strings, n, N)
return @distributed (min) for i in 1:N-1
minimum(hamming4(strings[i], strings[j]) for j in i+1:N)
end
end
### ... the rest of code remains unchanged
推荐阅读
- ios - MPMusicPlayerController 和 iOS 11.3
- apache - Http/1.1 服务不可用
- r - 从 RStudio 终端访问 Rust
- ios - 按位置打印数组显示为空
- java - 类 GamesActivity 中的方法 updateGames 不能是 App
- sql - 这个 ERD 图正确吗?
- powershell - powershell:从csv文件中提取数值之前的数据
- design-patterns - 评估不同类型规则集的设计模式
- php - 无法连接 QuickBooks PHP DevKit
- vba - Microsoft Word 句子长度检查器