Skip to content

Commit 248ef9e

Browse files
authored
Add benchmark project, some small optimizations (#39)
* Add benchmark project, some small optimizations
* Use .NET 8 SDK in build pipeline
* Update unit test project to .NET 8
* Make GetProfile protected internal
1 parent f2b0321 commit 248ef9e

10 files changed

Lines changed: 161 additions & 21 deletions

File tree

Lines changed: 117 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,117 @@
1+
using BenchmarkDotNet.Attributes;
2+
3+
namespace F23.StringSimilarity.Benchmarks;
4+
5+
/// <summary>
/// BenchmarkDotNet micro-benchmarks covering the string distance algorithms of the library.
/// </summary>
/// <remarks>
/// Every benchmark creates the algorithm instance inside the measured method, so the
/// reported time and allocations include construction as well as the distance computation.
/// Each method returns the computed distance instead of discarding it, so the JIT cannot
/// treat the measured call as dead code.
/// </remarks>
[MemoryDiagnoser]
public class Benchmarks
{
    // The same fixed input pair is fed to every algorithm.
    private const string FirstInput = "hello";
    private const string SecondInput = "world";

    /// <summary>Measures <c>Cosine.Distance</c> on the fixed input pair.</summary>
    /// <returns>The computed distance.</returns>
    [Benchmark]
    public double Cosine() => new Cosine().Distance(FirstInput, SecondInput);

    /// <summary>Measures <c>Damerau.Distance</c> on the fixed input pair.</summary>
    /// <returns>The computed distance.</returns>
    [Benchmark]
    public double Damerau() => new Damerau().Distance(FirstInput, SecondInput);

    /// <summary>Measures <c>Jaccard.Distance</c> on the fixed input pair.</summary>
    /// <returns>The computed distance.</returns>
    [Benchmark]
    public double Jaccard() => new Jaccard().Distance(FirstInput, SecondInput);

    /// <summary>Measures <c>JaroWinkler.Distance</c> on the fixed input pair.</summary>
    /// <returns>The computed distance.</returns>
    [Benchmark]
    public double JaroWinkler() => new JaroWinkler().Distance(FirstInput, SecondInput);

    /// <summary>Measures <c>Levenshtein.Distance</c> on the fixed input pair.</summary>
    /// <returns>The computed distance.</returns>
    [Benchmark]
    public double Levenshtein() => new Levenshtein().Distance(FirstInput, SecondInput);

    /// <summary>Measures <c>LongestCommonSubsequence.Distance</c> on the fixed input pair.</summary>
    /// <returns>The computed distance.</returns>
    [Benchmark]
    public double LongestCommonSubsequence() => new LongestCommonSubsequence().Distance(FirstInput, SecondInput);

    /// <summary>Measures <c>MetricLCS.Distance</c> on the fixed input pair.</summary>
    /// <returns>The computed distance.</returns>
    [Benchmark]
    public double MetricLCS() => new MetricLCS().Distance(FirstInput, SecondInput);

    /// <summary>Measures <c>NGram.Distance</c> on the fixed input pair.</summary>
    /// <returns>The computed distance.</returns>
    [Benchmark]
    public double NGram() => new NGram().Distance(FirstInput, SecondInput);

    /// <summary>Measures <c>NormalizedLevenshtein.Distance</c> on the fixed input pair.</summary>
    /// <returns>The computed distance.</returns>
    [Benchmark]
    public double NormalizedLevenshtein() => new NormalizedLevenshtein().Distance(FirstInput, SecondInput);

    /// <summary>Measures <c>OptimalStringAlignment.Distance</c> on the fixed input pair.</summary>
    /// <returns>The computed distance.</returns>
    [Benchmark]
    public double OptimalStringAlignment() => new OptimalStringAlignment().Distance(FirstInput, SecondInput);

    /// <summary>Measures <c>QGram.Distance</c> on the fixed input pair.</summary>
    /// <returns>The computed distance.</returns>
    [Benchmark]
    public double QGram() => new QGram().Distance(FirstInput, SecondInput);

    /// <summary>Measures <c>RatcliffObershelp.Distance</c> on the fixed input pair.</summary>
    /// <returns>The computed distance.</returns>
    [Benchmark]
    public double RatcliffObershelp() => new RatcliffObershelp().Distance(FirstInput, SecondInput);

    /// <summary>Measures <c>SorensenDice.Distance</c> on the fixed input pair.</summary>
    /// <returns>The computed distance.</returns>
    [Benchmark]
    public double SorensenDice() => new SorensenDice().Distance(FirstInput, SecondInput);

    /// <summary>
    /// Measures <c>WeightedLevenshtein.Distance</c> on the fixed input pair, using
    /// <see cref="ExampleCharSub"/> as the substitution cost function.
    /// </summary>
    /// <returns>The computed distance.</returns>
    [Benchmark]
    public double WeightedLevenshtein() =>
        new WeightedLevenshtein(new ExampleCharSub()).Distance(FirstInput, SecondInput);

    /// <summary>
    /// Sample substitution cost function handed to the weighted Levenshtein benchmark.
    /// </summary>
    private sealed class ExampleCharSub : ICharacterSubstitution
    {
        /// <summary>Returns the cost of substituting <paramref name="c1"/> with <paramref name="c2"/>.</summary>
        /// <param name="c1">The first character of the substitution.</param>
        /// <param name="c2">The second character of the substitution.</param>
        /// <returns>0.5 when <paramref name="c1"/> is 't' and <paramref name="c2"/> is 'r'; otherwise 1.0.</returns>
        public double Cost(char c1, char c2)
        {
            // The cost for substituting 't' and 'r' is considered smaller as these 2 are
            // located next to each other on a keyboard. Only the ('t', 'r') argument order
            // is discounted; every other pair costs 1.0.
            return c1 == 't' && c2 == 'r' ? 0.5 : 1.0;
        }
    }
}
Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,18 @@
1+
<Project Sdk="Microsoft.NET.Sdk">
2+
3+
<PropertyGroup>
4+
<OutputType>Exe</OutputType>
5+
<TargetFramework>net8.0</TargetFramework>
6+
<ImplicitUsings>enable</ImplicitUsings>
7+
<Nullable>enable</Nullable>
8+
</PropertyGroup>
9+
10+
<ItemGroup>
11+
<PackageReference Include="BenchmarkDotNet" Version="0.13.12" />
12+
</ItemGroup>
13+
14+
<ItemGroup>
15+
<ProjectReference Include="..\src\F23.StringSimilarity\F23.StringSimilarity.csproj" />
16+
</ItemGroup>
17+
18+
</Project>
Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
using BenchmarkDotNet.Running;
2+
using F23.StringSimilarity.Benchmarks;
3+
4+
// Forward the command-line arguments so BenchmarkDotNet console options
// (for example --filter or --job) are honoured; with no arguments the
// whole Benchmarks class runs exactly as before.
BenchmarkRunner.Run<Benchmarks>(args: args);

F23.StringSimilarity.sln

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,8 @@ Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "F23.StringSimilarity", "src
77
EndProject
88
Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "F23.StringSimilarity.Tests", "test\F23.StringSimilarity.Tests\F23.StringSimilarity.Tests.csproj", "{68F339E6-278F-4B04-A6ED-422AAD30591F}"
99
EndProject
10+
Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "F23.StringSimilarity.Benchmarks", "F23.StringSimilarity.Benchmarks\F23.StringSimilarity.Benchmarks.csproj", "{3A9605B1-820C-43C2-8F9B-72BCA5F5543B}"
11+
EndProject
1012
Global
1113
GlobalSection(SolutionConfigurationPlatforms) = preSolution
1214
Debug|Any CPU = Debug|Any CPU
@@ -21,6 +23,10 @@ Global
2123
{68F339E6-278F-4B04-A6ED-422AAD30591F}.Debug|Any CPU.Build.0 = Debug|Any CPU
2224
{68F339E6-278F-4B04-A6ED-422AAD30591F}.Release|Any CPU.ActiveCfg = Release|Any CPU
2325
{68F339E6-278F-4B04-A6ED-422AAD30591F}.Release|Any CPU.Build.0 = Release|Any CPU
26+
{3A9605B1-820C-43C2-8F9B-72BCA5F5543B}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
27+
{3A9605B1-820C-43C2-8F9B-72BCA5F5543B}.Debug|Any CPU.Build.0 = Debug|Any CPU
28+
{3A9605B1-820C-43C2-8F9B-72BCA5F5543B}.Release|Any CPU.ActiveCfg = Release|Any CPU
29+
{3A9605B1-820C-43C2-8F9B-72BCA5F5543B}.Release|Any CPU.Build.0 = Release|Any CPU
2430
EndGlobalSection
2531
GlobalSection(SolutionProperties) = preSolution
2632
HideSolutionNode = FALSE

src/F23.StringSimilarity/Jaccard.cs

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@
2424

2525
using System;
2626
using System.Collections.Generic;
27+
using System.Linq;
2728
using F23.StringSimilarity.Interfaces;
2829

2930
// ReSharper disable LoopCanBeConvertedToQuery
@@ -83,14 +84,13 @@ public double Similarity(string s1, string s2)
8384
var profile1 = GetProfile(s1);
8485
var profile2 = GetProfile(s2);
8586

86-
var union = new HashSet<string>();
87-
union.UnionWith(profile1.Keys);
88-
union.UnionWith(profile2.Keys);
87+
// SSNET Specific: use LINQ for more optimal distinct count
88+
var unionCount = profile1.Keys.Concat(profile2.Keys).Distinct().Count();
8989

9090
int inter = profile1.Keys.Count + profile2.Keys.Count
91-
- union.Count;
91+
- unionCount;
9292

93-
return 1.0 * inter / union.Count;
93+
return 1.0 * inter / unionCount;
9494
}
9595

9696

src/F23.StringSimilarity/Levenshtein.cs

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -109,7 +109,7 @@ public double Distance<T>(ReadOnlySpan<T> s1, ReadOnlySpan<T> s2, int limit)
109109
// create two work vectors of integer distances
110110
int[] v0 = new int[s2.Length + 1];
111111
int[] v1 = new int[s2.Length + 1];
112-
int[] vtemp;
112+
// SSNET: removed unneeded int[] vtemp;
113113

114114
// initialize v0 (the previous row of distances)
115115
// this row is A[0][i]: edit distance for an empty s
@@ -155,9 +155,7 @@ public double Distance<T>(ReadOnlySpan<T> s1, ReadOnlySpan<T> s2, int limit)
155155
// System.arraycopy(v1, 0, v0, 0, v0.length);
156156

157157
// Flip references to current and previous row
158-
vtemp = v0;
159-
v0 = v1;
160-
v1 = vtemp;
158+
(v0, v1) = (v1, v0); // SSNET specific: Swap v0 and v1 using tuples
161159
}
162160

163161
return v0[s2.Length];

src/F23.StringSimilarity/NGram.cs

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -103,7 +103,7 @@ public double Distance(string s0, string s1)
103103
char[] sa = new char[sl + n - 1];
104104
float[] p; // 'previous' cost array, horizontally
105105
float[] d; // Cost array, horizontally
106-
float[] d2; // Placeholder to assist in swapping p and d
106+
// SSNET removed unneeded: float[] d2; // Placeholder to assist in swapping p and d
107107

108108
// Construct sa with prefix
109109
for (int i1 = 0; i1 < sa.Length; i1++)
@@ -172,9 +172,7 @@ public double Distance(string s0, string s1)
172172
d[i] = Math.Min(Math.Min(d[i - 1] + 1, p[i] + 1), p[i - 1] + ec);
173173
}
174174
// Copy current distance counts to 'previous row' distance counts
175-
d2 = p;
176-
p = d;
177-
d = d2;
175+
(p, d) = (d, p); // SSNET specific: swap p and d using tuples
178176
}
179177

180178
// Our last action in the above loop was to switch d and p, so p now

src/F23.StringSimilarity/ShingleBased.cs

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -41,7 +41,7 @@ public abstract class ShingleBased
4141
/// <summary>
4242
/// Pattern for finding multiple following spaces
4343
/// </summary>
44-
private static readonly Regex SPACE_REG = new Regex("\\s+");
44+
private static readonly Regex SPACE_REG = new Regex("\\s+", RegexOptions.Compiled);
4545

4646
/// <summary>
4747
/// </summary>
@@ -59,7 +59,7 @@ protected ShingleBased(int k)
5959

6060
protected ShingleBased() : this(DEFAULT_K) { }
6161

62-
public IDictionary<string, int> GetProfile(string s)
62+
protected internal Dictionary<string, int> GetProfile(string s)
6363
{
6464
var shingles = new Dictionary<string, int>();
6565

@@ -79,7 +79,7 @@ public IDictionary<string, int> GetProfile(string s)
7979
}
8080
}
8181

82-
return new ReadOnlyDictionary<string, int>(shingles);
82+
return shingles;
8383
}
8484
}
8585
}

src/F23.StringSimilarity/WeightedLevenshtein.cs

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@
2323
*/
2424

2525
using System;
26+
using System.Threading;
2627
using F23.StringSimilarity.Interfaces;
2728
// ReSharper disable SuggestVarOrType_Elsewhere
2829
// ReSharper disable TooWideLocalVariableScope
@@ -112,7 +113,7 @@ public double Distance(string s1, string s2, double limit)
112113
// create two work vectors of floating point (i.e. weighted) distances
113114
double[] v0 = new double[s2.Length + 1];
114115
double[] v1 = new double[s2.Length + 1];
115-
double[] vtemp;
116+
// SSNET: removed unneeded double[] vtemp;
116117

117118
// initialize v0 (the previous row of distances)
118119
// this row is A[0][i]: edit distance for an empty s1
@@ -166,9 +167,7 @@ public double Distance(string s1, string s2, double limit)
166167
// copy v1 (current row) to v0 (previous row) for next iteration
167168
// System.arraycopy(v1, 0, v0, 0, v0.length);
168169
// Flip references to current and previous row
169-
vtemp = v0;
170-
v0 = v1;
171-
v1 = vtemp;
170+
(v0, v1) = (v1, v0); // SSNET Specific: Swap references using tuples instead of temporary
172171
}
173172

174173
return v0[s2.Length];

test/F23.StringSimilarity.Tests/F23.StringSimilarity.Tests.csproj

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -30,4 +30,4 @@
3030
<ProjectReference Include="..\..\src\F23.StringSimilarity\F23.StringSimilarity.csproj" />
3131
</ItemGroup>
3232

33-
</Project>
33+
</Project>

0 commit comments

Comments
 (0)