问题
Found this coded mergesort on http://www.explainth.at/en/delphi/dsort.shtml (site down but try wayback machine or this site: http://read.pudn.com/downloads192/sourcecode/delphi_control/901147/Sorts.pas__.htm) but essentially the array defined was not for an array of string.
type TSortArray = array[0..8191] of Double;
I want to pass an array of string that would possibly eliminate duplicates (this would be Union?) and preserve original order if possible for later resorting it back to original index position minus the duplicates of course (original index) so array can be passed back for further processing. I am using very large files of strings with millions of strings (14 to 30 million) so TStringList
is not an option. Best option for these large files is to use arrays of string or arrays of records (or maybe single linked list??) and sort with stable algorithm made for large amount of data.
- How can I change this to take array of string?
- How can it be further modified to delete or at least mark duplicates?
- Is it possible to store original index number to place back strings in original position?
- Are arrays of string or arrays of record better for large number of strings as compared to a single linked list?
Questions are listed in order of importance so if you answer question number 1 only that is fine. Thank you in advance for all your input.
procedure MergeSort(var Vals:TSortArray;ACount:Integer);
var AVals:TSortArray;
procedure Merge(ALo,AMid,AHi:Integer);
var i,j,k,m:Integer;
begin
i:=0;
for j:=ALo to AMid do
begin
AVals[i]:=Vals[j];
inc(i);
//copy lower half or Vals into temporary array AVals
end;
i:=0;j:=AMid + 1;k:=ALo;//j could be undefined after the for loop!
while ((k < j) and (j <= AHi)) do
if (AVals[i] < Vals[j]) then
begin
Vals[k]:=AVals[i];
inc(i);inc(k);
end else
begin
Vals[k]:=Vals[j];
inc(k);inc(j);
end;
{locate next greatest value in Vals or AVals and copy it to the
right position.}
for m:=k to j - 1 do
begin
Vals[m]:=AVals[i];
inc(i);
end;
//copy back any remaining, unsorted, elements
end;
procedure PerformMergeSort(ALo,AHi:Integer);
var AMid:Integer;
begin
if (ALo < AHi) then
begin
AMid:=(ALo + AHi) shr 1;
PerformMergeSort(ALo,AMid);
PerformMergeSort(AMid + 1,AHi);
Merge(ALo,AMid,AHi); <==== passing the array as string causes AV breakdown here
end;
end;
begin
SetLength(AVals, ACount);
PerformMergeSort(0,ACount - 1);
end;
回答1:
Answer to the second question: Mergesort modification with duplicate deleting. Should work for strings.
//returns new valid length
function MergeSortRemoveDuplicates(var Vals: array of Integer):Integer;
var
AVals: array of Integer;
//returns index of the last valid element
function Merge(I0, I1, J0, J1: Integer):Integer;
var
i, j, k, LC:Integer;
begin
LC := I1 - I0;
for i := 0 to LC do
AVals[i]:=Vals[i + I0];
//copy lower half or Vals into temporary array AVals
k := I0;
i := 0;
j := J0;
while ((i <= LC) and (j <= J1)) do
if (AVals[i] < Vals[j]) then begin
Vals[k] := AVals[i];
inc(i);
inc(k);
end else if (AVals[i] > Vals[j]) then begin
Vals[k]:=Vals[j];
inc(k);
inc(j);
end else begin //duplicate
Vals[k] := AVals[i];
inc(i);
inc(j);
inc(k);
end;
//copy the rest
while i <= LC do begin
Vals[k] := AVals[i];
inc(i);
inc(k);
end;
if k <> j then
while j <= J1 do begin
Vals[k]:=Vals[j];
inc(k);
inc(j);
end;
Result := k - 1;
end;
//returns index of the last valid element
function PerformMergeSort(ALo, AHi:Integer): Integer; //returns
var
AMid, I1, J1:Integer;
begin
//It would be wise to use Insertion Sort when (AHi - ALo) is small (about 32-100)
if (ALo < AHi) then
begin
AMid:=(ALo + AHi) shr 1;
I1 := PerformMergeSort(ALo, AMid);
J1 := PerformMergeSort(AMid + 1, AHi);
Result := Merge(ALo, I1, AMid + 1, J1);
end else
Result := ALo;
end;
begin
SetLength(AVals, Length(Vals) div 2 + 1);
Result := 1 + PerformMergeSort(0, High(Vals));
end;
//short test
var
A: array of Integer;
i, NewLen: Integer;
begin
Randomize;
SetLength(A, 12);
for i := 0 to High(A) do
A[i] := Random(10);
NewLen := MergeSortRemoveDuplicates(A);
SetLength(A, NewLen);
for i := 0 to High(A) do
Memo1.Lines.Add(IntToStr(A[i]))
end;
Simple modification for strings:
function MergeSortRemoveDuplicates(var Vals: array of String):Integer;
var
AVals: array of String;
and test case:
var
List: TStringList;
Arr: array of string;
i, n: Integer;
begin
with TStringList.Create do try
LoadFromFile('F:\m2.txt'); //contains some equal strings
SetLength(Arr, Count);
for i := 0 to Count - 1 do
Arr[i] := Strings[i];
finally
Free
end;
n := MergeSortRemoveDuplicates(Arr);
for i := 0 to n - 1 do
Memo1.Lines.Add(Arr[i]);
end;
回答2:
You'd need to modify the declaration TSortArray from array of double to array of string (or array of MyRecord)
The comparison routines in the Merge nested proc needs to be made compatible for strings. Check for anywhere that determines whether AVal[x] < / > AVal[y]. Delphi has procedures for this (AnsiCompareText / AnsiCompareStr depending on whether you want case-sensitivity)
That should work, but if you hadn't done this in your earlier attempts then Delphi should have complained about type mismatches rather than giving an AV, so there may be something else going on
I think duplicate checking should be done post-sort - it only requires one scan through of the data
If you want to store original index data then you will probably need to use an array of record (data: string; OriginalIndex: integer). Code in the Merge procedure then needs to be modified to pass Vals[x].Data to comparison routines. Filling the OriginalIndex values will be a quick scan before calling the Merge procedure
Not 100% sure, to be honest - it's easier to move large contiguous chunks of data with linked lists than with arrays, and arrays don't need messing about with pointers. If your dataset is sufficiently large you may even need to resort to streaming to disk which is likely to drive your choice more than either of those points.
来源:https://stackoverflow.com/questions/12673633/delphi-mergesort-for-string-arrays