@@ -251,36 +251,75 @@ and COOMatrix<'a when 'a : struct and 'a : equality>(rowCount: int, columnCount:
251251 let i = ndRange.GlobalID0
252252
253253 if i < allIndicesLength then
254- let f n = if 0 > n + 1 - shortSide then 0 else n + 1 - shortSide
255- let mutable leftEdge = f i
256-
257- let g n = if n > longSide - 1 then longSide - 1 else n
258- let mutable rightEdge = g i
254+ let knots = localArray< int> 2
255+ let localID = ndRange.LocalID0
256+ if localID < 2 then
257+ let mutable x = localID * ( workGroupSize - 1 ) + i - 1
258+ if x >= shortSide + longSide then x <- shortSide + longSide - 1
259+ let diagonalNumber = x
260+
261+ let mutable leftEdge = diagonalNumber + 1 - shortSide
262+ if leftEdge < 0 then leftEdge <- 0
263+
264+ let mutable rightEdge = longSide - 1
265+ if rightEdge > diagonalNumber then rightEdge <- diagonalNumber
266+
267+ while leftEdge <= rightEdge do
268+ let middleIdx = ( leftEdge + rightEdge) / 2
269+ let firstIndex = firstIndicesBuffer.[ middleIdx]
270+ let secondIndex = secondIndicesBuffer.[ diagonalNumber - middleIdx]
271+ if firstIndex < secondIndex then leftEdge <- middleIdx + 1 else rightEdge <- middleIdx - 1
272+
273+ knots.[ localID] <- leftEdge
274+ barrier ()
275+
276+ let beginIdx = knots.[ 0 ] // BANK CONFLICTS?
277+ let endIdx = knots.[ 1 ]
278+ let firstLocalLength = endIdx - beginIdx
279+ let mutable x = workGroupSize - firstLocalLength
280+ if endIdx = longSide then x <- shortSide - i + localID + beginIdx
281+ let secondLocalLength = x
282+
283+ //First indices are from 0 to firstLocalLength - 1 inclusive
284+ //Second indices are from firstLocalLength to firstLocalLength + secondLocalLength - 1 inclusive
285+ let localIndices = localArray< uint64> workGroupSize
286+
287+ if localID < firstLocalLength then
288+ localIndices.[ localID] <- firstIndicesBuffer.[ beginIdx + localID]
289+ if localID < secondLocalLength then
290+ localIndices.[ firstLocalLength + localID] <- secondIndicesBuffer.[ i - beginIdx]
291+ barrier ()
292+
293+ let mutable leftEdge = localID + 1 - secondLocalLength
294+ if leftEdge < 0 then leftEdge <- 0
295+
296+ let mutable rightEdge = firstLocalLength - 1
297+ if rightEdge > localID then rightEdge <- localID
259298
260299 while leftEdge <= rightEdge do
261300 let middleIdx = ( leftEdge + rightEdge) / 2
262- let firstIndex = firstIndicesBuffer .[ middleIdx]
263- let secondIndex = secondIndicesBuffer .[ i - middleIdx]
301+ let firstIndex = localIndices .[ middleIdx]
302+ let secondIndex = localIndices .[ firstLocalLength + localID - middleIdx]
264303 if firstIndex < secondIndex then leftEdge <- middleIdx + 1 else rightEdge <- middleIdx - 1
265304
266305 let boundaryX = rightEdge
267- let boundaryY = i - leftEdge
306+ let boundaryY = localID - leftEdge
268307
269- if boundaryX < 0 then
270- allIndicesBuffer.[ i] <- secondIndicesBuffer.[ boundaryY]
271- allValuesBuffer.[ i] <- secondValuesBuffer.[ boundaryY]
272- elif boundaryY < 0 then
273- allIndicesBuffer.[ i] <- firstIndicesBuffer.[ boundaryX]
274- allValuesBuffer.[ i] <- firstValuesBuffer.[ boundaryX]
308+ let isValidX = boundaryX >= 0
309+ let isValidY = boundaryY >= 0
310+
311+ let mutable fstIdx = uint64 0
312+ if isValidX then fstIdx <- localIndices.[ boundaryX]
313+
314+ let mutable sndIdx = uint64 0
315+ if isValidY then sndIdx <- localIndices.[ firstLocalLength + boundaryY]
316+
317+ if not isValidX || isValidY && fstIdx < sndIdx then
318+ allIndicesBuffer.[ i] <- sndIdx
319+ allValuesBuffer.[ i] <- secondValuesBuffer.[ i - localID - beginIdx + boundaryY]
275320 else
276- let firstIndex = firstIndicesBuffer.[ boundaryX]
277- let secondIndex = secondIndicesBuffer.[ boundaryY]
278- if firstIndex < secondIndex then
279- allIndicesBuffer.[ i] <- secondIndex
280- allValuesBuffer.[ i] <- secondValuesBuffer.[ boundaryY]
281- else
282- allIndicesBuffer.[ i] <- firstIndex
283- allValuesBuffer.[ i] <- firstValuesBuffer.[ boundaryX]
321+ allIndicesBuffer.[ i] <- fstIdx
322+ allValuesBuffer.[ i] <- firstValuesBuffer.[ beginIdx + boundaryX]
284323 @>
285324
286325 let createSortedConcatenation =
@@ -311,9 +350,13 @@ and COOMatrix<'a when 'a : struct and 'a : equality>(rowCount: int, columnCount:
311350
312351 if i < allIndicesLength - 1 && allIndicesBuffer.[ i] = allIndicesBuffer.[ i + 1 ] then
313352 auxiliaryArrayBuffer.[ i + 1 ] <- 0
314- let localResultBuffer = (% plus) allValuesBuffer.[ i] allValuesBuffer.[ i + 1 ]
353+
354+ //Do not drop explicit zeroes
355+ allValuesBuffer.[ i] <- (% plus) allValuesBuffer.[ i] allValuesBuffer.[ i + 1 ]
356+
315357 //Drop explicit zeroes
316- if localResultBuffer = zero then auxiliaryArrayBuffer.[ i] <- 0 else allValuesBuffer.[ i] <- localResultBuffer
358+ //let localResultBuffer = (%plus) allValuesBuffer.[i] allValuesBuffer.[i + 1]
359+ //if localResultBuffer = zero then auxiliaryArrayBuffer.[i] <- 0 else allValuesBuffer.[i] <- localResultBuffer
317360 @>
318361
319362 let fillAuxiliaryArray =
@@ -343,7 +386,7 @@ and COOMatrix<'a when 'a : struct and 'a : equality>(rowCount: int, columnCount:
343386 let i = ndRange.GlobalID0
344387
345388 if i < auxiliaryArrayLength && auxiliaryArrayBuffer.[ i] = 1 then
346- let index = prefixSumArrayBuffer.[ i] - 1
389+ let index = prefixSumArrayBuffer.[ i]
347390
348391 resultIndicesBuffer.[ index] <- allIndicesBuffer.[ i]
349392 resultValuesBuffer.[ index] <- allValuesBuffer.[ i]
@@ -354,7 +397,7 @@ and COOMatrix<'a when 'a : struct and 'a : equality>(rowCount: int, columnCount:
354397
355398 let createUnion =
356399 opencl {
357- let! prefixSumArray = Toolbox.prefixSum2 auxiliaryArray
400+ let! prefixSumArray = prefixSum3 auxiliaryArray
358401 let binder kernelP =
359402 let ndRange = _ 1D( workSize auxiliaryArray.Length, workGroupSize)
360403 kernelP
0 commit comments