In a PowerShell script, I have two data sets, each with multiple columns. Not all of these columns are shared between the two sets.
For example, data set 1:
A B XY ZY
- -
I agree with @Matt. Use a hashtable -- something like the below. This should run in $O(m + 2n)$ time rather than $O(mn)$ time, where $m$ and $n$ are the sizes of the two data sets.
Timings on my system
original Solution above
#10 TotalSeconds : 0.07788
#100 TotalSeconds : 0.37937
#1000 TotalSeconds : 5.25092
#10000 TotalSeconds : 242.82018
#20000 TotalSeconds : 906.01584
This definitely looks like $O(n^2)$.
Solution Below
#10 TotalSeconds : 0.094
#100 TotalSeconds : 0.425
#1000 TotalSeconds : 3.757
#10000 TotalSeconds : 45.652
#20000 TotalSeconds : 92.918
This looks linear.
Solution
I used three techniques to increase the speed:
--
function Get-Hash{
    <#
    .SYNOPSIS
    Builds a lookup key string from selected properties of an object.

    .DESCRIPTION
    Reads each property named in -Properties from -InputObject, in the order
    given, and joins the values with ':'.

    NOTE(review): because ':' is the separator, values that themselves contain
    ':' can yield the same key for different inputs (for example 'a:b','c' and
    'a','b:c').

    .PARAMETER InputObject
    The object whose property values form the key.

    .PARAMETER Properties
    Names of the properties to read, in key order.

    .OUTPUTS
    System.String. The property values joined with ':' (an empty string when
    no property names are supplied).
    #>
    param(
        [Parameter(Mandatory=$true)]
        [object]$InputObject,
        [Parameter()]
        [string[]]$Properties
    )
    # Collect the values in one pass. Using '+=' on a collection rebuilds it as
    # a new array on every append, so it is avoided here. Statement output
    # unrolls array-valued properties one level, exactly as '+=' did, so the
    # resulting key text is unchanged.
    $values = @(foreach ($p in $Properties) { $InputObject.$p })
    return ( $values -join ':' )
}
function Merge-Objects{
    <#
    .SYNOPSIS
    Merges two data sets on a set of key properties (full outer join).

    .DESCRIPTION
    Every row of -Dataset1 is copied and extended with the properties that
    exist only in -Dataset2. When a -Dataset2 row has the same key (built by
    Get-Hash from -Properties), its values are used; otherwise the added
    properties are $null. Each -Dataset2 row is consumed by at most one
    -Dataset1 row. -Dataset2 rows that were never matched are then copied and
    extended with the -Dataset1-only properties set to $null.

    Rows are indexed in a hashtable, so each data set is traversed once instead
    of comparing every row with every other row.

    .PARAMETER Dataset1
    The first (left) data set.

    .PARAMETER Dataset2
    The second (right) data set. Keys are added with Hashtable.Add, which
    reports an error for a key that is already present, so keys are expected
    to be unique within this data set.

    .PARAMETER Properties
    Names of the properties that together form the join key.

    .OUTPUTS
    The merged objects: all -Dataset1 rows first (in input order), followed by
    the unmatched -Dataset2 rows (in hashtable enumeration order).
    #>
    param(
        [Parameter(Mandatory=$true)]
        [object[]]$Dataset1,
        [Parameter(Mandatory=$true)]
        [object[]]$Dataset2,
        [Parameter()]
        [string[]]$Properties
    )
    $results = [System.Collections.ArrayList]::new()

    # Resolve the property names of each side once, up front, instead of
    # re-expanding the name list inside the filter for every property.
    $ds1Names = @($Dataset1 | Get-Member -MemberType Properties | Select-Object -ExpandProperty Name)
    $ds2Names = @($Dataset2 | Get-Member -MemberType Properties | Select-Object -ExpandProperty Name)
    $ds1Only  = @($ds1Names | Where-Object { $_ -notin $ds2Names })
    $ds2Only  = @($ds2Names | Where-Object { $_ -notin $ds1Names })

    # Index Dataset2 by join key for constant-time lookups.
    $hash = @{}
    foreach ($row in $Dataset2) {
        $hash.Add( (Get-Hash $row $Properties), $row )
    }

    foreach ($row in $Dataset1) {
        $key = Get-Hash $row $Properties
        $tempObject = $row.PSObject.Copy()
        if ($hash.ContainsKey($key)) {
            $other = $hash[$key]
            # Remove the matched row so that only unmatched Dataset2 rows
            # remain in the hashtable for the second pass.
            $hash.Remove($key)
            foreach ($name in $ds2Only) {
                # Get-Member can report member kinds (e.g. Property,
                # ScriptProperty) that Add-Member cannot create from a plain
                # value, so the copied value is always attached as a
                # NoteProperty.
                $tempObject | Add-Member -MemberType NoteProperty -Name $name -Value $other.$name
            }
        } else {
            foreach ($name in $ds2Only) {
                $tempObject | Add-Member -MemberType NoteProperty -Name $name -Value $null
            }
        }
        [void]$results.Add($tempObject)
    }

    # Whatever is left in the hashtable had no partner in Dataset1: emit it
    # with the Dataset1-only properties present but empty.
    foreach ($row in $hash.Values) {
        $tempObject = $row.PSObject.Copy()
        foreach ($name in $ds1Only) {
            $tempObject | Add-Member -MemberType NoteProperty -Name $name -Value $null
        }
        [void]$results.Add($tempObject)
    }
    $results
}
########
# Benchmark driver: builds two overlapping synthetic data sets keyed on A and B
# and times a single Merge-Objects call over them.
$dsLength = 10000

# Data set 1 covers keys 0..$dsLength and carries the extra columns XY and ZY.
$dataset1 = 0..$dsLength | ForEach-Object {
    New-Object psobject -Property @{ A=$_ ; B="val$_" ; XY = "foo$_"; ZY ="bar$_" }
}

# Data set 2 starts halfway through data set 1 and runs half a length past its
# end, so part of the key range overlaps; it carries the columns ABC and GH.
$rangeStart = $dsLength/2
$rangeEnd   = $dsLength*1.5
$dataset2 = $rangeStart..$rangeEnd | ForEach-Object {
    New-Object psobject -Property @{ A=$_ ; B="val$_" ; ABC = "foo$_"; GH ="bar$_" }
}

# Same arguments as a direct call, passed by splatting for readability.
$mergeArgs = @{
    Dataset1   = $dataset1
    Dataset2   = $dataset2
    Properties = "A","B"
}
Measure-Command -Expression {
    $data = Merge-Objects @mergeArgs
}