Skip to content

Commit d2e1523

Browse files
authored
[FileFormats.MPS] improve performance of parsing each line (#2940)
1 parent ae95d6a commit d2e1523

2 files changed

Lines changed: 194 additions & 49 deletions

File tree

src/FileFormats/MPS/read.jl

Lines changed: 128 additions & 41 deletions
Original file line numberDiff line numberDiff line change
@@ -156,9 +156,7 @@ end
156156
HEADER_INDICATORS,
157157
)
158158

159-
# `Headers` gets called _alot_ (on every line), so we try very hard to be
160-
# efficient.
161-
function Headers(s)
159+
function parse_single_header(s::AbstractString)
162160
N = length(s)
163161
x = first(s)
164162
if N == 3
@@ -180,7 +178,7 @@ function Headers(s)
180178
return HEADER_ENDATA
181179
end
182180
elseif N == 7
183-
if (x == 'C' || x == 'c') && (uppercase(s) == "COLUMNS")
181+
if (x == 'C' || x == 'c') && uppercase(s) == "COLUMNS"
184182
return HEADER_COLUMNS
185183
elseif (x == 'Q' || x == 'q')
186184
header = uppercase(s)
@@ -190,34 +188,127 @@ function Headers(s)
190188
return HEADER_QMATRIX
191189
end
192190
end
193-
elseif N >= 8
194-
if (x == 'O' || x == 'o') && startswith(uppercase(s), "OBJSENSE")
191+
elseif N == 8
192+
if (x == 'O' || x == 'o') && uppercase(s) == "OBJSENSE"
195193
return HEADER_OBJSENSE
196-
elseif (x == 'Q' || x == 'q')
197-
header = uppercase(s)
198-
if startswith(header, "QCMATRIX")
199-
return HEADER_QCMATRIX
200-
elseif startswith(header, "QSECTION")
201-
return HEADER_QSECTION
194+
end
195+
elseif N == 10
196+
if (x == 'I' || x == 'i') && uppercase(s) == "INDICATORS"
197+
return HEADER_INDICATORS
198+
end
199+
end
200+
return HEADER_UNKNOWN
201+
end
202+
203+
# Classify the first field of a two-field line. Only the eight-character
# keywords OBJSENSE, QCMATRIX and QSECTION are recognised (case-insensitively);
# every other input yields `HEADER_UNKNOWN`.
function parse_double_header(s::AbstractString)
    lead = first(s)
    if length(s) != 8
        return HEADER_UNKNOWN
    end
    # Look at the leading character first so that `uppercase` is only called for
    # plausible candidates.
    if lead == 'O' || lead == 'o'
        if uppercase(s) == "OBJSENSE"
            return HEADER_OBJSENSE
        end
    elseif lead == 'Q' || lead == 'q'
        keyword = uppercase(s)
        # `keyword` has exactly eight characters here, so a full comparison is
        # the same test as a prefix match against an eight-character literal.
        if keyword == "QCMATRIX"
            return HEADER_QCMATRIX
        elseif keyword == "QSECTION"
            return HEADER_QSECTION
        end
    end
    return HEADER_UNKNOWN
end
220+
221+
"""
222+
LineToItems(line::String)
223+
224+
Split on any whitespace characters. We can't split only on `' '` because at
225+
least one model in MIPLIB has `\t` as a separator.
226+
227+
This decision assumes that we are parsing a free MPS file, where whitespace is
228+
disallowed in names. If this ever becomes a problem, we could change to the
229+
fixed MPS format, where the files are split at the usual offsets.
230+
231+
This function is a more performant version of:
232+
```julia
233+
LineToItems(line::String) = split(line, r"\\s"; keepempty = false)
234+
```
235+
"""
236+
struct LineToItems
    # The raw line. Every field is addressed as a range of indices into it.
    line::String
    # Total number of whitespace-delimited fields found in `line`. The count
    # keeps going past five, even though only five ranges are stored.
    nfields::Int
    # Index ranges of the first five fields, in order of appearance. Each range
    # runs from the index of the field's first character to the index of its
    # last character, so both ends are valid string indices and the range can
    # be handed straight to `SubString`. Unused slots are `0:0`.
    fields::NTuple{5,UnitRange{Int}}

    function LineToItems(line::String)
        fields = ntuple(_ -> 0:0, Val(5))
        nfields, start, stop, in_field = 0, 0, 0, false
        # Walk the valid character indices instead of `1:ncodeunits(line)`.
        # Indexing a `String` at a UTF-8 continuation byte throws a
        # `StringIndexError`, so byte-wise indexing fails on any line that
        # contains a non-ASCII character.
        for i in eachindex(line)
            if isspace(line[i])
                if in_field
                    nfields += 1
                    fields = _store_field(fields, nfields, start:stop)
                    in_field = false
                end
            else
                if !in_field
                    start, in_field = i, true
                end
                # Index of the most recent non-space character. When the field
                # closes this is its last valid index, which for a multi-byte
                # character is not simply "next index minus one".
                stop = i
            end
        end
        # A line that does not end in whitespace leaves one field still open.
        if in_field
            nfields += 1
            fields = _store_field(fields, nfields, start:stop)
        end
        return new(line, nfields, fields)
    end
end

# Return a copy of `fields` with slot `k` replaced by `r`. If `k` is outside
# `1:5` the tuple is returned unchanged, so fields past the fifth are counted by
# the caller but not stored.
function _store_field(
    fields::NTuple{5,UnitRange{Int}},
    k::Int,
    r::UnitRange{Int},
)
    return ntuple(j -> j == k ? r : fields[j], Val(5))
end
211284

212-
function line_to_items(line)
213-
# Split on any whitespace characters. We can't split only on `' '` because
214-
# at least one models in MIPLIB has `\t` as a separator.
215-
#
216-
# This decision assumes that we are parsing a free MPS file, where
217-
# whitespace is disallowed in names. If this ever becomes a problem, we
218-
# could change to the fixed MPS format, where the files are split at the
219-
# usual offsets.
220-
return split(line, r"\s"; keepempty = false)
285+
# Number of whitespace-delimited fields on the line, as counted by the
# constructor.
function Base.length(x::LineToItems)
    return x.nfields
end
286+
287+
# Return field `i` as a `SubString` of the stored line. Only the first five
# fields are retrievable; any other index throws a `BoundsError`.
function Base.getindex(x::LineToItems, i::Int)
    in_range = i >= 1 && i <= 5 && i <= x.nfields
    in_range || throw(BoundsError(x, i))
    return SubString(x.line, x.fields[i])
end
293+
294+
# Iteration protocol: the state is the index of the next field to yield,
# starting from 1 and stopping once it passes `x.nfields`.
function Base.iterate(x::LineToItems, i = 1)
    return i <= x.nfields ? (x[i], i + 1) : nothing
end
302+
303+
# `parse_header` is called a lot (once for every line of the file), so it is
# written to be as cheap as possible: it dispatches on the number of fields and
# only ever inspects the first one.
function parse_header(s::LineToItems)
    n = length(s)
    n == 1 && return parse_single_header(s[1])
    n == 2 && return parse_double_header(s[1])
    return HEADER_UNKNOWN
end
222313

223314
"""
@@ -237,13 +328,12 @@ function Base.read!(io::IO, model::Model{T}) where {T}
237328
if startswith(data.contents, '*')
238329
continue # Lines starting with `*` are comments
239330
end
240-
line = string(strip(data.contents))
241-
if isempty(line)
331+
items = LineToItems(data.contents)
332+
if length(items) == 0
242333
continue # Skip blank lines
243334
end
244-
h = Headers(line)
335+
h = parse_header(items)
245336
if h == HEADER_OBJSENSE
246-
items = line_to_items(line)
247337
if length(items) == 2
248338
sense = uppercase(items[2])
249339
if !(sense in ("MIN", "MAX"))
@@ -258,7 +348,6 @@ function Base.read!(io::IO, model::Model{T}) where {T}
258348
end
259349
continue
260350
elseif h == HEADER_QCMATRIX || h == HEADER_QSECTION
261-
items = line_to_items(line)
262351
if length(items) != 2
263352
_throw_parse_error(
264353
data,
@@ -274,10 +363,8 @@ function Base.read!(io::IO, model::Model{T}) where {T}
274363
continue
275364
end
276365
# Otherwise, carry on with the previous header
277-
# TODO: split into hard fields based on column indices.
278-
items = line_to_items(line)
279366
if header == HEADER_NAME
280-
parse_name_line(data, line)
367+
parse_name_line(data)
281368
elseif header == HEADER_OBJSENSE
282369
sense = uppercase(only(items))
283370
if !(sense in ("MIN", "MAX"))
@@ -490,8 +577,8 @@ end
490577
# NAME
491578
# ==============================================================================
492579

493-
function parse_name_line(data::TempMPSModel, line)
494-
m = match(r"^\s*NAME(.*)"i, line)
580+
function parse_name_line(data::TempMPSModel)
581+
m = match(r"^\s*NAME(.*)"i, data.contents)
495582
if m === nothing
496583
_throw_parse_error(
497584
data,
@@ -506,7 +593,7 @@ end
506593
# ROWS
507594
# ==============================================================================
508595

509-
function parse_rows_line(data::TempMPSModel{T}, items::Vector) where {T}
596+
function parse_rows_line(data::TempMPSModel{T}, items) where {T}
510597
if length(items) < 2
511598
_throw_parse_error(
512599
data,
@@ -619,7 +706,7 @@ function _set_intorg(data::TempMPSModel{T}, column, column_name) where {T}
619706
return
620707
end
621708

622-
function parse_columns_line(data::TempMPSModel{T}, items::Vector) where {T}
709+
function parse_columns_line(data::TempMPSModel{T}, items) where {T}
623710
if length(items) == 3
624711
# [column name] [row name] [value]
625712
column_name, row_name, value = items
@@ -657,7 +744,7 @@ end
657744
# RHS
658745
# ==============================================================================
659746

660-
function parse_single_rhs(data, row_name, value, items::Vector)
747+
function parse_single_rhs(data, row_name, value, items)
661748
if row_name == data.obj_name
662749
data.obj_constant = value
663750
return
@@ -688,7 +775,7 @@ function parse_single_rhs(data, row_name, value, items::Vector)
688775
end
689776

690777
# TODO: handle multiple RHS vectors.
691-
function parse_rhs_line(data::TempMPSModel{T}, items::Vector) where {T}
778+
function parse_rhs_line(data::TempMPSModel{T}, items) where {T}
692779
if length(items) == 3
693780
# [rhs name] [row name] [value]
694781
rhs_name, row_name, value = items
@@ -744,7 +831,7 @@ function parse_single_range(data, row_name, value)
744831
end
745832

746833
# TODO: handle multiple RANGES vectors.
747-
function parse_ranges_line(data::TempMPSModel{T}, items::Vector) where {T}
834+
function parse_ranges_line(data::TempMPSModel{T}, items) where {T}
748835
if length(items) == 3
749836
# [rhs name] [row name] [value]
750837
_, row_name, value = items
@@ -859,7 +946,7 @@ function _parse_single_bound(
859946
end
860947
end
861948

862-
function parse_bounds_line(data::TempMPSModel{T}, items::Vector) where {T}
949+
function parse_bounds_line(data::TempMPSModel{T}, items) where {T}
863950
if length(items) == 3
864951
bound_type, _, column_name = items
865952
_parse_single_bound(data, column_name, bound_type)

test/FileFormats/MPS/test_MPS.jl

Lines changed: 66 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1126,11 +1126,12 @@ function test_parse_name_line()
11261126
" NAME foo" => "foo",
11271127
"" => nothing,
11281128
)
1129+
data.contents = line
11291130
data.name = "_"
11301131
if name === nothing
1131-
@test_throws MPS.ParseError MPS.parse_name_line(data, line)
1132+
@test_throws MPS.ParseError MPS.parse_name_line(data)
11321133
else
1133-
MPS.parse_name_line(data, line)
1134+
MPS.parse_name_line(data)
11341135
@test data.name == name
11351136
end
11361137
end
@@ -1702,12 +1703,12 @@ function test_issue_2792()
17021703
end
17031704

17041705
# Regression test for issue 2797: fields may be separated by tabs as well as
# spaces, in any mixture.
function test_issue_2797_tab()
    cases = (
        "a b" => ["a", "b"],
        " a b" => ["a", "b"],
        "a\tb" => ["a", "b"],
        "a\tb" => ["a", "b"],
        "a\t b" => ["a", "b"],
        " a \t b c " => ["a", "b", "c"],
    )
    for (line, expected) in cases
        @test collect(MPS.LineToItems(line)) == expected
    end
    return
end
17131714

@@ -1728,6 +1729,63 @@ function test_unsupported_objectives()
17281729
return
17291730
end
17301731

1732+
# Check `LineToItems` against `split` for one to five fields, then check the
# reported length and the indexing bounds for a seven-field and a two-field
# line.
function test_LineToItems()
    samples = (
        "a",
        " a ",
        "a b",
        " a b ",
        "a b c",
        " a b c ",
        "a b c d",
        " a b c d ",
        "a b c d e",
        " a b c d e ",
    )
    for line in samples
        expected = split(line, ' '; keepempty = false)
        @test collect(MPS.LineToItems(line)) == expected
    end
    # Seven fields are counted, but only indices 1 through 5 are accessible.
    long_line = MPS.LineToItems("a b c d e f g")
    @test length(long_line) == 7
    @test_throws BoundsError long_line[0]
    @test long_line[1] == "a"
    @test_throws BoundsError long_line[6]
    # With fewer than five fields the upper bound is the field count.
    short_line = MPS.LineToItems("a b")
    @test length(short_line) == 2
    @test_throws BoundsError short_line[3]
    return
end
1758+
1759+
# Check the header detected for each sample line, in its original casing and
# again in lower case.
function test_parse_header()
    cases = (
        "OBJSENSE" => MPS.HEADER_OBJSENSE,
        "OBJSENSE MAX" => MPS.HEADER_OBJSENSE,
        "ROWS" => MPS.HEADER_ROWS,
        "COLUMNS" => MPS.HEADER_COLUMNS,
        "RHS" => MPS.HEADER_RHS,
        "RANGES" => MPS.HEADER_RANGES,
        "BOUNDS" => MPS.HEADER_BOUNDS,
        "SOS" => MPS.HEADER_SOS,
        "ENDATA" => MPS.HEADER_ENDATA,
        "QUADOBJ" => MPS.HEADER_QUADOBJ,
        "QMATRIX" => MPS.HEADER_QMATRIX,
        "QCMATRIX c" => MPS.HEADER_QCMATRIX,
        "QSECTION c" => MPS.HEADER_QSECTION,
        "INDICATORS" => MPS.HEADER_INDICATORS,
        "" => MPS.HEADER_UNKNOWN,
        "FOO" => MPS.HEADER_UNKNOWN,
        "RHS X" => MPS.HEADER_UNKNOWN,
        "QDMATRIX X" => MPS.HEADER_UNKNOWN,
        "RHS X 1" => MPS.HEADER_UNKNOWN,
    )
    for (line, header) in cases
        for text in (line, lowercase(line))
            items = MPS.LineToItems(text)
            @test header == MPS.parse_header(items)
        end
    end
    return
end
1788+
17311789
end # TestMPS
17321790

17331791
TestMPS.runtests()

0 commit comments

Comments
 (0)