Skip to content

Commit 8ddb13c

Browse files
committed
Optimized coo element wise addition, not tested
1 parent f585ea2 commit 8ddb13c

5 files changed

Lines changed: 245 additions & 28 deletions

File tree

GraphBLAS-sharp.sln

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,8 @@ Project("{F2A71F9B-5D33-465A-A702-920D77279786}") = "GraphBLAS-sharp.Benchmarks"
1717
EndProject
1818
Project("{F2A71F9B-5D33-465A-A702-920D77279786}") = "docsTool", "docsTool\docsTool.fsproj", "{8855EC73-F6A1-43D3-AFBC-04A3E09F9BD9}"
1919
EndProject
20+
Project("{F2A71F9B-5D33-465A-A702-920D77279786}") = "App", "src\App\App.fsproj", "{6821C17E-1AB2-4D2E-B97F-17C36E6DFF32}"
21+
EndProject
2022
Global
2123
GlobalSection(SolutionConfigurationPlatforms) = preSolution
2224
Debug|Any CPU = Debug|Any CPU
@@ -78,10 +80,23 @@ Global
7880
{4C6EB3D0-B6BF-4FF5-BC77-CC7CB3F307E6}.Release|x64.Build.0 = Release|Any CPU
7981
{4C6EB3D0-B6BF-4FF5-BC77-CC7CB3F307E6}.Release|x86.ActiveCfg = Release|Any CPU
8082
{4C6EB3D0-B6BF-4FF5-BC77-CC7CB3F307E6}.Release|x86.Build.0 = Release|Any CPU
83+
{6821C17E-1AB2-4D2E-B97F-17C36E6DFF32}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
84+
{6821C17E-1AB2-4D2E-B97F-17C36E6DFF32}.Debug|Any CPU.Build.0 = Debug|Any CPU
85+
{6821C17E-1AB2-4D2E-B97F-17C36E6DFF32}.Debug|x64.ActiveCfg = Debug|Any CPU
86+
{6821C17E-1AB2-4D2E-B97F-17C36E6DFF32}.Debug|x64.Build.0 = Debug|Any CPU
87+
{6821C17E-1AB2-4D2E-B97F-17C36E6DFF32}.Debug|x86.ActiveCfg = Debug|Any CPU
88+
{6821C17E-1AB2-4D2E-B97F-17C36E6DFF32}.Debug|x86.Build.0 = Debug|Any CPU
89+
{6821C17E-1AB2-4D2E-B97F-17C36E6DFF32}.Release|Any CPU.ActiveCfg = Release|Any CPU
90+
{6821C17E-1AB2-4D2E-B97F-17C36E6DFF32}.Release|Any CPU.Build.0 = Release|Any CPU
91+
{6821C17E-1AB2-4D2E-B97F-17C36E6DFF32}.Release|x64.ActiveCfg = Release|Any CPU
92+
{6821C17E-1AB2-4D2E-B97F-17C36E6DFF32}.Release|x64.Build.0 = Release|Any CPU
93+
{6821C17E-1AB2-4D2E-B97F-17C36E6DFF32}.Release|x86.ActiveCfg = Release|Any CPU
94+
{6821C17E-1AB2-4D2E-B97F-17C36E6DFF32}.Release|x86.Build.0 = Release|Any CPU
8195
EndGlobalSection
8296
GlobalSection(NestedProjects) = preSolution
8397
{5D30E174-2538-47AC-8443-318C8C5DC2C9} = {C397A34C-84F1-49E7-AEBC-2F9F2B196216}
8498
{1CA2E092-2320-451D-A4F0-9ED7C7C528CA} = {ACBEE43C-7A88-4FB1-9B06-DB064D22B29F}
8599
{4C6EB3D0-B6BF-4FF5-BC77-CC7CB3F307E6} = {DEF656DE-BCED-4C49-B5ED-950D4A29B78B}
100+
{6821C17E-1AB2-4D2E-B97F-17C36E6DFF32} = {C397A34C-84F1-49E7-AEBC-2F9F2B196216}
86101
EndGlobalSection
87102
EndGlobal

src/App/App.fsproj

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,16 @@
1+
<Project Sdk="Microsoft.NET.Sdk">
2+
3+
<PropertyGroup>
4+
<OutputType>Exe</OutputType>
5+
<TargetFramework>netcoreapp3.1</TargetFramework>
6+
</PropertyGroup>
7+
8+
<ItemGroup>
9+
<Compile Include="Program.fs" />
10+
</ItemGroup>
11+
12+
<ItemGroup>
13+
<ProjectReference Include="..\Graphblas-sharp\Graphblas-sharp.fsproj" />
14+
</ItemGroup>
15+
16+
</Project>

src/App/Program.fs

Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,35 @@
1+
2+
open System
3+
open GraphBLAS.FSharp
4+
5+
open Brahma.OpenCL
6+
open Brahma.FSharp.OpenCL.Core
7+
open Brahma.FSharp.OpenCL.Extensions
8+
open GlobalContext
9+
open Helpers
10+
open FSharp.Quotations.Evaluator
11+
open Brahma.FSharp.OpenCL.WorkflowBuilder.Basic
12+
open Brahma.FSharp.OpenCL.WorkflowBuilder.Evaluation
13+
open GraphBLAS.FSharp.Predefined
14+
15+
/// Demo entry point: adds two small COO matrices element-wise through the
/// OpenCL workflow and prints every stored element of the result as
/// "(row, column, value)".
///
/// Returns 0 as the process exit code. `argv` is not used.
[<EntryPoint>]
let main argv =
    // Two identical 100x100 matrices with three stored entries each.
    let fstMatrix = COOMatrix(100, 100, [|0;1;2|], [|0;1;2|], [|1.;2.;3.|])
    let sndMatrix = COOMatrix(100, 100, [|0;1;2|], [|0;1;2|], [|1.;2.;3.|])

    let workflow =
        opencl {
            let! result = fstMatrix.EWiseAdd sndMatrix None FloatSemiring.addMult
            // NOTE(review): presumably copies the result back to host memory
            // before it is read below -- confirm against ToHost.
            let! _ = result.ToHost ()
            return result
        }

    let res: COOMatrix<float> = downcast oclContext.RunSync workflow

    let indices = res.Indices
    let values = res.Values

    for position in 0 .. indices.Length - 1 do
        let packed = indices.[position]
        // Decoded exactly as before: the upper 32 bits are printed as the row,
        // the lower 32 bits as the column.
        let row = int (packed >>> 32)
        let column = int packed
        // Index `values` by the element position. The previous code shadowed
        // the loop variable with the decoded row and therefore read
        // values.[row], which is only correct when row = position.
        printfn "(%i, %i, %A)" row column values.[position]

    0

src/GraphBLAS-sharp/Implementations.fs

Lines changed: 69 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -251,36 +251,75 @@ and COOMatrix<'a when 'a : struct and 'a : equality>(rowCount: int, columnCount:
251251
let i = ndRange.GlobalID0
252252

253253
if i < allIndicesLength then
254-
let f n = if 0 > n + 1 - shortSide then 0 else n + 1 - shortSide
255-
let mutable leftEdge = f i
256-
257-
let g n = if n > longSide - 1 then longSide - 1 else n
258-
let mutable rightEdge = g i
254+
let knots = localArray<int> 2
255+
let localID = ndRange.LocalID0
256+
if localID < 2 then
257+
let mutable x = localID * (workGroupSize - 1) + i - 1
258+
if x >= shortSide + longSide then x <- shortSide + longSide - 1
259+
let diagonalNumber = x
260+
261+
let mutable leftEdge = diagonalNumber + 1 - shortSide
262+
if leftEdge < 0 then leftEdge <- 0
263+
264+
let mutable rightEdge = longSide - 1
265+
if rightEdge > diagonalNumber then rightEdge <- diagonalNumber
266+
267+
while leftEdge <= rightEdge do
268+
let middleIdx = (leftEdge + rightEdge) / 2
269+
let firstIndex = firstIndicesBuffer.[middleIdx]
270+
let secondIndex = secondIndicesBuffer.[diagonalNumber - middleIdx]
271+
if firstIndex < secondIndex then leftEdge <- middleIdx + 1 else rightEdge <- middleIdx - 1
272+
273+
knots.[localID] <- leftEdge
274+
barrier ()
275+
276+
let beginIdx = knots.[0] // BANK CONFLICTS?
277+
let endIdx = knots.[1]
278+
let firstLocalLength = endIdx - beginIdx
279+
let mutable x = workGroupSize - firstLocalLength
280+
if endIdx = longSide then x <- shortSide - i + localID + beginIdx
281+
let secondLocalLength = x
282+
283+
//First indices are from 0 to firstLocalLength - 1 inclusive
284+
//Second indices are from firstLocalLength to firstLocalLength + secondLocalLength - 1 inclusive
285+
let localIndices = localArray<uint64> workGroupSize
286+
287+
if localID < firstLocalLength then
288+
localIndices.[localID] <- firstIndicesBuffer.[beginIdx + localID]
289+
if localID < secondLocalLength then
290+
localIndices.[firstLocalLength + localID] <- secondIndicesBuffer.[i - beginIdx]
291+
barrier ()
292+
293+
let mutable leftEdge = localID + 1 - secondLocalLength
294+
if leftEdge < 0 then leftEdge <- 0
295+
296+
let mutable rightEdge = firstLocalLength - 1
297+
if rightEdge > localID then rightEdge <- localID
259298

260299
while leftEdge <= rightEdge do
261300
let middleIdx = (leftEdge + rightEdge) / 2
262-
let firstIndex = firstIndicesBuffer.[middleIdx]
263-
let secondIndex = secondIndicesBuffer.[i - middleIdx]
301+
let firstIndex= localIndices.[middleIdx]
302+
let secondIndex = localIndices.[firstLocalLength + localID - middleIdx]
264303
if firstIndex < secondIndex then leftEdge <- middleIdx + 1 else rightEdge <- middleIdx - 1
265304

266305
let boundaryX = rightEdge
267-
let boundaryY = i - leftEdge
306+
let boundaryY = localID - leftEdge
268307

269-
if boundaryX < 0 then
270-
allIndicesBuffer.[i] <- secondIndicesBuffer.[boundaryY]
271-
allValuesBuffer.[i] <- secondValuesBuffer.[boundaryY]
272-
elif boundaryY < 0 then
273-
allIndicesBuffer.[i] <- firstIndicesBuffer.[boundaryX]
274-
allValuesBuffer.[i] <- firstValuesBuffer.[boundaryX]
308+
let isValidX = boundaryX >= 0
309+
let isValidY = boundaryY >= 0
310+
311+
let mutable fstIdx = uint64 0
312+
if isValidX then fstIdx <- localIndices.[boundaryX]
313+
314+
let mutable sndIdx = uint64 0
315+
if isValidY then sndIdx <- localIndices.[firstLocalLength + boundaryY]
316+
317+
if not isValidX || isValidY && fstIdx < sndIdx then
318+
allIndicesBuffer.[i] <- sndIdx
319+
allValuesBuffer.[i] <- secondValuesBuffer.[i - localID - beginIdx + boundaryY]
275320
else
276-
let firstIndex = firstIndicesBuffer.[boundaryX]
277-
let secondIndex = secondIndicesBuffer.[boundaryY]
278-
if firstIndex < secondIndex then
279-
allIndicesBuffer.[i] <- secondIndex
280-
allValuesBuffer.[i] <- secondValuesBuffer.[boundaryY]
281-
else
282-
allIndicesBuffer.[i] <- firstIndex
283-
allValuesBuffer.[i] <- firstValuesBuffer.[boundaryX]
321+
allIndicesBuffer.[i] <- fstIdx
322+
allValuesBuffer.[i] <- firstValuesBuffer.[beginIdx + boundaryX]
284323
@>
285324

286325
let createSortedConcatenation =
@@ -311,9 +350,13 @@ and COOMatrix<'a when 'a : struct and 'a : equality>(rowCount: int, columnCount:
311350

312351
if i < allIndicesLength - 1 && allIndicesBuffer.[i] = allIndicesBuffer.[i + 1] then
313352
auxiliaryArrayBuffer.[i + 1] <- 0
314-
let localResultBuffer = (%plus) allValuesBuffer.[i] allValuesBuffer.[i + 1]
353+
354+
//Do not drop explicit zeroes
355+
allValuesBuffer.[i] <- (%plus) allValuesBuffer.[i] allValuesBuffer.[i + 1]
356+
315357
//Drop explicit zeroes
316-
if localResultBuffer = zero then auxiliaryArrayBuffer.[i] <- 0 else allValuesBuffer.[i] <- localResultBuffer
358+
//let localResultBuffer = (%plus) allValuesBuffer.[i] allValuesBuffer.[i + 1]
359+
//if localResultBuffer = zero then auxiliaryArrayBuffer.[i] <- 0 else allValuesBuffer.[i] <- localResultBuffer
317360
@>
318361

319362
let fillAuxiliaryArray =
@@ -343,7 +386,7 @@ and COOMatrix<'a when 'a : struct and 'a : equality>(rowCount: int, columnCount:
343386
let i = ndRange.GlobalID0
344387

345388
if i < auxiliaryArrayLength && auxiliaryArrayBuffer.[i] = 1 then
346-
let index = prefixSumArrayBuffer.[i] - 1
389+
let index = prefixSumArrayBuffer.[i]
347390

348391
resultIndicesBuffer.[index] <- allIndicesBuffer.[i]
349392
resultValuesBuffer.[index] <- allValuesBuffer.[i]
@@ -354,7 +397,7 @@ and COOMatrix<'a when 'a : struct and 'a : equality>(rowCount: int, columnCount:
354397

355398
let createUnion =
356399
opencl {
357-
let! prefixSumArray = Toolbox.prefixSum2 auxiliaryArray
400+
let! prefixSumArray = prefixSum3 auxiliaryArray
358401
let binder kernelP =
359402
let ndRange = _1D(workSize auxiliaryArray.Length, workGroupSize)
360403
kernelP

src/GraphBLAS-sharp/Toolbox.fs

Lines changed: 110 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -9,9 +9,9 @@ open FSharp.Quotations.Evaluator
99
open Brahma.FSharp.OpenCL.WorkflowBuilder.Basic
1010
open Brahma.FSharp.OpenCL.WorkflowBuilder.Evaluation
1111

12-
module internal Toolbox =
12+
module (*internal*) Toolbox =
1313

14-
let internal workGroupSize = 128
14+
let internal workGroupSize = 256
1515
let internal workSize n =
1616
let m = n - 1
1717
m - m % workGroupSize + workGroupSize
@@ -144,3 +144,111 @@ module internal Toolbox =
144144

145145
return (fst arrays)
146146
}
147+
148+
let rec (*internal*) prefixSum3
    (inputArray: int[]) =

    // The scan is performed on a copy; `inputArray` itself is never written here.
    let outputArray = Array.copy inputArray
    let outputArrayLength = outputArray.Length
    // NOTE(review): re-bound locally, presumably so the quotations below
    // capture a local value rather than the module-level binding -- confirm.
    let workGroupSize = workGroupSize

    // Kernel: per work-group exclusive scan of `resultBuffer` (in place).
    // The total of each work-group is stored in `verticesBuffer` at the
    // work-group's index. Elements past `resultLength` are treated as 0.
    // NOTE(review): the sweep indexing appears to assume workGroupSize is a
    // power of two -- confirm.
    let scanKernel =
        <@
            fun (ndRange: _1D)
                (resultBuffer: int[])
                (resultLength: int)
                (verticesBuffer: int[]) ->

                let localSums = localArray<int> workGroupSize
                let globalID = ndRange.GlobalID0
                let localID = ndRange.LocalID0

                if globalID < resultLength then localSums.[localID] <- resultBuffer.[globalID] else localSums.[localID] <- 0

                // Up-sweep: accumulate partial sums towards the last slot.
                let mutable step = 2
                while step <= workGroupSize do
                    barrier ()
                    if localID < workGroupSize / step then
                        let right = step * (localID + 1) - 1
                        localSums.[right] <- localSums.[right] + localSums.[right - (step >>> 1)]
                    step <- step <<< 1
                barrier ()

                // The last slot now holds the work-group total: publish it,
                // then clear it so the down-sweep yields an exclusive scan.
                if localID = workGroupSize - 1 then
                    verticesBuffer.[globalID / workGroupSize] <- localSums.[localID]
                    localSums.[localID] <- 0

                // Down-sweep: push the partial sums back down.
                step <- workGroupSize
                while step > 1 do
                    barrier ()
                    if localID < workGroupSize / step then
                        let right = step * (localID + 1) - 1
                        let left = right - (step >>> 1)

                        let carried = localSums.[right]
                        localSums.[right] <- localSums.[right] + localSums.[left]
                        localSums.[left] <- carried
                    step <- step >>> 1
                barrier ()

                if globalID < resultLength then resultBuffer.[globalID] <- localSums.[localID]
        @>

    // Launches `scanKernel` over the first `length` elements of `data`,
    // writing the per-work-group totals into `vertices`.
    let runScan data length vertices =
        opencl {
            let binder kernelP =
                let ndRange = _1D(workSize length, workGroupSize)
                kernelP ndRange data length vertices
            do! RunCommand scanKernel binder
        }

    // Kernel: adds verticesBuffer.[i / bunchLength] to every element i of
    // `resultBuffer` that lies below `resultLength`.
    let updateKernel =
        <@
            fun (ndRange: _1D)
                (resultBuffer: int[])
                (resultLength: int)
                (verticesBuffer: int[])
                (bunchLength: int) ->

                let globalID = ndRange.GlobalID0
                if globalID < resultLength then
                    resultBuffer.[globalID] <- resultBuffer.[globalID] + verticesBuffer.[globalID / bunchLength]
        @>

    // Launches `updateKernel` over the whole of `outputArray`; `depth` is
    // passed through as the kernel's `bunchLength`.
    let runUpdate vertices depth =
        opencl {
            let binder kernelP =
                let ndRange = _1D(workSize outputArrayLength, workGroupSize)
                kernelP ndRange outputArray outputArrayLength vertices depth
            do! RunCommand updateKernel binder
        }

    // Two equally sized scratch arrays for the per-work-group totals; they
    // trade roles after every level of the loop below.
    let groupCount = (workSize outputArrayLength) / workGroupSize
    let mutable currentVertices = Array.zeroCreate<int> groupCount
    let mutable nextVertices = Array.zeroCreate<int> groupCount

    opencl {
        do! runScan outputArray outputArrayLength currentVertices

        let mutable verticesLength = (outputArrayLength - 1) / workGroupSize + 1
        let mutable bunchLength = workGroupSize
        while verticesLength > 1 do
            // Scan the totals of the current level, then fold the scanned
            // totals into the output as per-bunch offsets.
            do! runScan currentVertices verticesLength nextVertices
            do! runUpdate currentVertices bunchLength

            bunchLength <- bunchLength * workGroupSize

            let scanned = currentVertices
            currentVertices <- nextVertices
            nextVertices <- scanned

            verticesLength <- (verticesLength - 1) / workGroupSize + 1

        return outputArray
    }

0 commit comments

Comments
 (0)