Skip to content

Commit af724b2

Browse files
Optimize CSV reader and writer for performance
Introduces fast path optimizations for CsvDataReader and CsvWriter, including ultra-fast inline parsing for string-only columns and direct writing of numeric types without StringBuilder allocation. Updates benchmarks and documentation to reflect ~25% performance improvement for all-columns read scenarios and reduces gap vs Sylvan. Bumps package version to 1.1.1.
1 parent 1693886 commit af724b2

File tree

5 files changed

+378
-11
lines changed

5 files changed

+378
-11
lines changed

project/Dataplat.Dbatools.Csv/CHANGELOG.md

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,15 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
77

88
## [Unreleased]
99

10+
## [1.1.1] - 2025-12-04
11+
12+
### Changed
13+
- **~25% performance improvement** for all-columns read scenarios (55ms vs 73ms for 100K rows)
14+
- Added fast conversion path for simple string-only columns, bypassing unnecessary checks
15+
- Added ultra-fast inline parsing path (.NET 8+) that writes directly to output, skipping intermediate buffers
16+
- Gap vs Sylvan reduced from 2.0x to 1.6x for all-columns reads
17+
- Added fast write path for CsvWriter - writes numeric types directly without StringBuilder allocation
18+
1019
## [1.1.0] - 2025-12-04
1120

1221
### Added

project/Dataplat.Dbatools.Csv/Dataplat.Dbatools.Csv.csproj

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@
77

88
<!-- NuGet Package Metadata -->
99
<PackageId>Dataplat.Dbatools.Csv</PackageId>
10-
<Version>1.1.0</Version>
10+
<Version>1.1.1</Version>
1111
<Authors>Chrissy LeMaire</Authors>
1212
<Company>Dataplat</Company>
1313
<Product>Dataplat.Dbatools.Csv</Product>

project/Dataplat.Dbatools.Csv/README.md

Lines changed: 10 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -49,21 +49,21 @@ Benchmark: 100,000 rows × 10 columns (.NET 8, AVX-512)
4949

5050
| Library | Time (ms) | vs Dataplat |
5151
|---------|-----------|-------------|
52-
| Sep | 19 ms | 3.8x faster |
53-
| Sylvan | 29 ms | 2.5x faster |
54-
| **Dataplat** | **74 ms** | **baseline** |
55-
| CsvHelper | 76 ms | ~same |
56-
| LumenWorks | 433 ms | **5.9x slower** |
52+
| Sep | 18 ms | 3.7x faster |
53+
| Sylvan | 27 ms | 2.5x faster |
54+
| **Dataplat** | **67 ms** | **baseline** |
55+
| CsvHelper | 76 ms | 1.1x slower |
56+
| LumenWorks | 395 ms | **5.9x slower** |
5757

5858
**All columns read (full row processing):**
5959

6060
| Library | Time (ms) | vs Dataplat |
6161
|---------|-----------|-------------|
62-
| Sep | 35 ms | 2.1x faster |
63-
| Sylvan | 37 ms | 2.0x faster |
64-
| **Dataplat** | **73 ms** | **baseline** |
65-
| CsvHelper | 101 ms | 1.4x slower |
66-
| LumenWorks | 100 ms | 1.4x slower |
62+
| Sep | 30 ms | 1.8x faster |
63+
| Sylvan | 35 ms | 1.6x faster |
64+
| **Dataplat** | **55 ms** | **baseline** |
65+
| CsvHelper | 97 ms | 1.8x slower |
66+
| LumenWorks | 102 ms | 1.9x slower |
6767

6868
### Understanding the performance tradeoffs
6969

project/dbatools/Csv/Reader/CsvDataReader.cs

Lines changed: 254 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -118,6 +118,10 @@ public sealed class CsvDataReader : IDataReader
118118
private System.Buffers.SearchValues<char> _fieldTerminators;
119119
#endif
120120

121+
// Fast path optimization flags - determined during initialization
122+
private bool _useFastConversion; // True when simple string-only conversion can be used
123+
private bool _useFastParsing; // True when ultra-fast inline parsing can be used
124+
121125
#endregion
122126

123127
#region Field Info Structure
@@ -466,6 +470,9 @@ private void Initialize()
466470
// Cache converters for each column to avoid per-row registry lookups
467471
CacheColumnConverters();
468472

473+
// Determine if we can use fast path optimizations
474+
InitializeFastPathOptimizations();
475+
469476
// Prepare converted values array
470477
_convertedValues = new object[_columns.Count + _staticColumns.Count];
471478

@@ -489,6 +496,56 @@ private void CacheColumnConverters()
489496
}
490497
}
491498

499+
/// <summary>
/// Decides, from the reader options and the column configuration, whether the
/// fast conversion path and the ultra-fast inline parsing path may be used, and
/// stores the outcome in <c>_useFastConversion</c> and <c>_useFastParsing</c>.
/// </summary>
private void InitializeFastPathOptimizations()
{
    // The fast paths only apply when there are no static columns and every
    // column is a plain string without a cached converter.
    bool plainStringsOnly = _staticColumns.Count == 0;
    for (int i = 0; plainStringsOnly && i < _columns.Count; i++)
    {
        var column = _columns[i];
        plainStringsOnly = column.DataType == typeof(string) && column.CachedConverter == null;
    }

    // Fast conversion additionally requires that no option rewrites field values:
    // no trimming, no configured null value, no empty-vs-null distinction and
    // no column defaults.
    _useFastConversion = plainStringsOnly
        && _options.TrimmingOptions == ValueTrimmingOptions.None
        && _options.NullValue == null
        && !_options.DistinguishEmptyFromNull
        && !_options.UseColumnDefaults;

    if (!_useFastConversion)
    {
        // Ultra-fast parsing is a strict subset of fast conversion.
        _useFastParsing = false;
        return;
    }

    // Ultra-fast inline parsing further requires a single-character delimiter,
    // no quote normalization, no comment character, no parallel processing and
    // a quote mode other than Lenient.
    _useFastParsing = _singleCharDelimiter
        && !_options.NormalizeQuotes
        && _options.Comment == '\0'
        && !_options.EnableParallelProcessing
        && _options.QuoteMode != QuoteMode.Lenient;
}
548+
492549
private void InitializeColumnsFromFirstDataRow()
493550
{
494551
// Read the first data row to determine column count
@@ -1507,6 +1564,14 @@ private bool ReadBufferedFirstLine()
15071564
/// </summary>
15081565
private bool ReadSequentialDirect()
15091566
{
1567+
#if NET8_0_OR_GREATER
1568+
// Ultra-fast path: inline parsing directly to _convertedValues for simple CSV
1569+
if (_useFastParsing && _isInitialized)
1570+
{
1571+
return ReadSequentialUltraFast();
1572+
}
1573+
#endif
1574+
15101575
while (true)
15111576
{
15121577
try
@@ -1553,6 +1618,155 @@ private bool ReadSequentialDirect()
15531618
}
15541619
}
15551620

1621+
#if NET8_0_OR_GREATER
1622+
/// <summary>
/// Ultra-fast inline parsing for simple CSV files (no quotes, no special options).
/// Writes each field directly to <c>_convertedValues</c>, skipping all intermediate buffers.
/// </summary>
/// <remarks>
/// A record that cannot be handled here is handed to <see cref="ReadSequentialDirect"/>
/// after the buffer position has been rewound to the first character of that record,
/// so the standard parser always sees the record in full. This happens when a field
/// starts with the quote character, when the record runs past the buffered data, when
/// the column mapping is not sequential, when the record has more fields than columns,
/// or when a field ends on an unexpected terminator. Falling back clears
/// <c>_useFastParsing</c>, so subsequent reads stay on the standard path.
/// </remarks>
/// <returns>
/// <c>true</c> when a record was read; <c>false</c> when <c>RefillBuffer</c> reports that
/// no more data is available. On fallback, the result of <see cref="ReadSequentialDirect"/>.
/// </returns>
[MethodImpl(MethodImplOptions.AggressiveOptimization)]
private bool ReadSequentialUltraFast()
{
    char delimChar = _delimiterFirstChar;
    char quoteChar = _options.Quote;
    int columnCount = _columns.Count;
    var values = _convertedValues;

    while (true)
    {
        // Ensure we have data
        if (_bufferPosition >= _bufferLength)
        {
            if (!RefillBuffer())
            {
                _currentRecord = null;
                return false;
            }
        }

        // Skip empty lines
        while (_bufferPosition < _bufferLength)
        {
            char lead = _buffer[_bufferPosition];
            if (lead == '\r')
            {
                _bufferPosition++;
                _currentLineNumber++;
                if (_bufferPosition < _bufferLength && _buffer[_bufferPosition] == '\n')
                {
                    _bufferPosition++;
                }
                continue;
            }
            if (lead == '\n')
            {
                _bufferPosition++;
                _currentLineNumber++;
                continue;
            }
            break; // Found start of record
        }

        if (_bufferPosition >= _bufferLength)
        {
            continue; // Need more data
        }

        // Remember where the record begins. RefillBuffer is never called while a
        // record is being parsed, so this index stays valid until we return and
        // lets a fallback hand the complete record to the standard parser.
        int recordStart = _bufferPosition;
        bool recordTerminated = false;

        // Parse the record directly into _convertedValues
        _currentRecordIndex++;
        int fieldIndex = 0;

        while (fieldIndex < columnCount)
        {
            // Buffer exhausted mid-record - fall back to standard path
            if (_bufferPosition >= _bufferLength)
            {
                return AbandonUltraFastRecord(recordStart);
            }

            // Quoted field - fall back to standard path
            if (_buffer[_bufferPosition] == quoteChar)
            {
                return AbandonUltraFastRecord(recordStart);
            }

            // Column mapping is non-trivial - fall back to standard path
            if (_columns[fieldIndex].SourceIndex != fieldIndex)
            {
                return AbandonUltraFastRecord(recordStart);
            }

            int fieldStart = _bufferPosition;

            // Find the next delimiter or newline in the buffered data
            ReadOnlySpan<char> remaining = _buffer.AsSpan(fieldStart, _bufferLength - fieldStart);
            int idx = remaining.IndexOfAny(_fieldTerminators);

            if (idx < 0)
            {
                // No terminator in the buffered data - fall back to standard path
                return AbandonUltraFastRecord(recordStart);
            }

            _bufferPosition += idx;
            char terminator = _buffer[_bufferPosition];

            if (terminator != delimChar && terminator != '\r' && terminator != '\n')
            {
                // NOTE(review): _fieldTerminators is built elsewhere; if it ever contains
                // something other than the delimiter and newline characters, let the
                // standard parser deal with it instead of treating it as a line end.
                return AbandonUltraFastRecord(recordStart);
            }

            // Empty fields become DBNull, matching ConvertCurrentRecordFast
            if (idx == 0)
            {
                values[fieldIndex] = DBNull.Value;
            }
            else
            {
                values[fieldIndex] = new string(_buffer, fieldStart, idx);
            }

            fieldIndex++;
            _bufferPosition++; // Skip the delimiter, '\r' or '\n'

            if (terminator == delimChar)
            {
                continue;
            }

            // End of record - also consume the '\n' of a "\r\n" pair
            if (terminator == '\r' && _bufferPosition < _bufferLength && _buffer[_bufferPosition] == '\n')
            {
                _bufferPosition++;
            }
            recordTerminated = true;
            break;
        }

        if (!recordTerminated)
        {
            // The last column ended on a delimiter (or there are no columns), so
            // the line holds more fields than columns. Returning here would leave
            // the tail of the line to be parsed as a separate record; let the
            // standard path process the whole line instead.
            return AbandonUltraFastRecord(recordStart);
        }

        // Fill remaining columns with DBNull
        while (fieldIndex < columnCount)
        {
            values[fieldIndex] = DBNull.Value;
            fieldIndex++;
        }

        // NOTE(review): _recordBuffer is not populated on this path - confirm that no
        // accessor reads raw fields from _currentRecord after an ultra-fast read.
        _currentRecord = _recordBuffer;
        _currentLineNumber++;
        return true;
    }
}

/// <summary>
/// Gives up on the ultra-fast path for the record that starts at
/// <paramref name="recordStart"/>: rewinds the buffer position to that index, undoes
/// the record index increment, disables ultra-fast parsing and reads the record
/// through <see cref="ReadSequentialDirect"/>.
/// </summary>
/// <param name="recordStart">Index in <c>_buffer</c> of the first character of the record.</param>
/// <returns>The result of <see cref="ReadSequentialDirect"/>.</returns>
private bool AbandonUltraFastRecord(int recordStart)
{
    _bufferPosition = recordStart;
    _currentRecordIndex--;
    _useFastParsing = false; // Must be cleared first: ReadSequentialDirect re-enters the fast path otherwise
    return ReadSequentialDirect();
}
1768+
#endif
1769+
15561770
/// <summary>
15571771
/// Handles parse errors consistently for both parsing paths.
15581772
/// </summary>
@@ -1681,6 +1895,14 @@ private void HandleFieldCountMismatch(string line, int expectedCount)
16811895

16821896
private void ConvertCurrentRecord()
16831897
{
1898+
// Fast path: all columns are strings with no special handling needed
1899+
if (_useFastConversion)
1900+
{
1901+
ConvertCurrentRecordFast();
1902+
return;
1903+
}
1904+
1905+
// Standard path with all options supported
16841906
for (int i = 0; i < _columns.Count; i++)
16851907
{
16861908
var column = _columns[i];
@@ -1761,6 +1983,35 @@ private void ConvertCurrentRecord()
17611983
}
17621984
}
17631985

1986+
/// <summary>
/// Fast conversion path used when every column is a plain string and no option
/// alters field values. Copies the raw fields of <c>_currentRecord</c> into
/// <c>_convertedValues</c> without any per-column conversion checks.
/// </summary>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private void ConvertCurrentRecordFast()
{
    var source = _currentRecord;
    var target = _convertedValues;
    int total = _columns.Count;

    for (int col = 0; col < total; col++)
    {
        int src = _columns[col].SourceIndex;

        // A source index past the end of the record is treated as a missing value.
        string text = null;
        if (src < source.Length)
        {
            text = source[src];
        }

        // Missing and empty values are both stored as DBNull; everything else
        // is stored as the raw string, untouched.
        target[col] = string.IsNullOrEmpty(text) ? DBNull.Value : (object)text;
    }
}
2014+
17642015
private object ConvertValue(string value, CsvColumn column)
17652016
{
17662017
if (column.DataType == typeof(string))
@@ -3608,6 +3859,9 @@ public void SetColumnType(string columnName, Type type)
36083859
if (type != typeof(string))
36093860
{
36103861
_columns[i].CachedConverter = _columns[i].Converter ?? _options.TypeConverterRegistry?.GetConverter(type);
3862+
// Invalidate fast path when non-string column type is set
3863+
_useFastConversion = false;
3864+
_useFastParsing = false;
36113865
}
36123866
else
36133867
{

0 commit comments

Comments
 (0)