问题
I'd like to create a 24x20 matrix (8 sections, each with 60 cells, i.e. 6x10) to visualize how frequently missing data occurs across cycles (each cycle = 480 values) in a dataset held in a pandas DataFrame, and to plot it for each of the columns 'A', 'B' and 'C'.
So far I have been able to create the CSV files, map the values into the matrix in the right way, and plot it via sns.heatmap(df.isnull()) after changing the missing data (nan & inf) into 0 or something like 0.01234, which has the least influence on the data and, on the other hand, can still be plotted.
Below is my script so far:
import numpy as np
import pandas as pd
import os
import seaborn as sns
import matplotlib.pyplot as plt
def mkdf(ListOf480Numbers, sections=8, rows=6):
    """Split a flat sequence into sections of rows, reversing every other row.

    The input is cut into ``sections`` consecutive chunks and each chunk into
    ``rows`` consecutive lines (both via ``np.array_split``). Lines at even
    positions (0, 2, 4, ...) are reversed; lines at odd positions are kept
    in their original order.

    Args:
        ListOf480Numbers: Flat sequence of values (list, ndarray or pandas
            Series). With the defaults, 480 values give 8 sections of
            6 lines with 10 values each.
        sections: Number of sections to split the input into.
        rows: Number of lines per section.

    Returns:
        A list with ``sections`` entries; each entry is a list of ``rows``
        1-D NumPy arrays.
    """
    # Work on a plain ndarray so the split is purely positional, whatever
    # index a pandas Series argument carries.
    flat = np.asarray(ListOf480Numbers)
    fixMatrix = []
    for section in np.array_split(flat, sections):
        lines = np.array_split(section, rows)
        fixMatrix.append(
            [line[::-1] if j % 2 == 0 else line for j, line in enumerate(lines)]
        )
    return fixMatrix
def print_df(fixMatrix):
    """Assemble the sections produced by ``mkdf`` into one DataFrame.

    Sections are placed two per band: section ``2k`` on the left and section
    ``2k + 1`` on the right. The last pair forms the top band and the first
    pair (sections 0 and 1) forms the bottom band. With 8 sections of
    6 lines x 10 values the result is 24 rows x 20 columns.

    Args:
        fixMatrix: Sequence with an even number of sections, each a sequence
            of equally many lines (iterables of values).

    Returns:
        pandas.DataFrame with one row per (band, line) combination.

    Raises:
        ValueError: If the number of sections is odd, so they cannot be
            paired.
    """
    n_sections = len(fixMatrix)
    if n_sections % 2:
        raise ValueError("print_df needs an even number of sections")

    values = []
    # Walk the pairs from the last one down to (0, 1) so that the highest
    # numbered sections end up in the top rows.
    for left in range(n_sections - 2, -1, -2):
        for left_line, right_line in zip(fixMatrix[left], fixMatrix[left + 1]):
            values.append([*left_line, *right_line])
    return pd.DataFrame(values)
# Raw string: the backslash in the Windows path must not be read as an escape.
dft = pd.read_csv(r'D:\Feryan.TXT', header=None)

# Rows are consumed in groups of four by position: id, A, B, C.
id_set = dft[dft.index % 4 == 0].astype('int').values
A = dft[dft.index % 4 == 1].values
B = dft[dft.index % 4 == 2].values
C = dft[dft.index % 4 == 3].values
data = {'A': A[:,0], 'B': B[:,0], 'C': C[:,0]}
df = pd.DataFrame(data, columns=['A','B','C'], index = id_set[:,0])

# Both masks are taken before inf is folded into nan, so they stay distinct.
nan = np.array(df.isnull())
inf = np.array(np.isinf(df))
# Treat +/-inf as missing; a single replace covers both signs.
df = df.replace([np.inf, -np.inf], np.nan)

# Number of complete 480-value cycles available in the data.
cycles = len(df) // 480
print(cycles)

for cycle in range(cycles):
    count = '{:04}'.format(cycle)
    j = cycle * 480
    new_value1 = df['A'].iloc[j:j+480]
    new_value2 = df['B'].iloc[j:j+480]
    new_value3 = df['C'].iloc[j:j+480]
    df1 = print_df(mkdf(new_value1))
    df2 = print_df(mkdf(new_value2))
    df3 = print_df(mkdf(new_value3))

    # One CSV per column and cycle, each written into that column's folder.
    for i, grid in zip(df, (df1, df2, df3)):
        os.makedirs(i, exist_ok=True)
        grid.to_csv(f'{i}/norm{i}{count}.csv', header=None, index=None)

    # Plot all columns ['A','B','C'] in one window, side by side.
    fig, axes = plt.subplots(nrows=1, ncols=3, figsize=(20,10))
    for ax, grid, label in zip(axes, (df1, df2, df3), 'ABC'):
        sns.heatmap(grid.isnull(), cbar=False, ax=ax)
        # White separators between the 6x10 sections.
        for y in (6, 12, 18):
            ax.axhline(y=y, color='w', linewidth=1.5)
        ax.axvline(x=10, color='w', linewidth=1.5)
        ax.set_title('Missing-data frequency in ' + label, fontsize=20, fontweight='bold', color='black', loc='center', style='italic')
        ax.axis('off')
    fig.suptitle('Missing-data visualization', color='yellow', backgroundcolor='black', fontsize=15, fontweight='bold')
    fig.subplots_adjust(top=0.92, bottom=0.02, left=0.05, right=0.96, hspace=0.2, wspace=0.2)
    fig.text(0.035, 0.93, 'dataset1', fontsize=19, fontweight='bold', rotation=42., ha='center', va='center', bbox=dict(boxstyle="round", ec=(1., 0.5, 0.5), fc=(1., 0.8, 0.8)))
    # `i` still holds the last column name here, so the figure is saved in
    # that column's folder.
    fig.savefig(f'{i}/result{count}.png')
    # Release the figure so figures do not pile up across cycles.
    plt.close(fig)
The problem is that I don't know how to plot the frequency of missing-data occurrence correctly, so that I can see in which sections and cells it happens most often.
Note 1: the more values are missing, the brighter the color should be. Data missing in 100% of the cycles should be shown in white, and solid black should indicate cells with no missing values. There could be a bar (color scale) running from black at 0% to white at 100%.
Note 2: I also provide a sample text file of the dataset covering 3 cycles. It includes a few missing values, but it can be modified manually to add more: dataset
Expected result should be like below:
回答1:
You can keep a separate array per column in which you count, over all cycles, how often each cell is nan/inf.
Your arrays always seem to have the same size, so I defined them with a fixed size. You can change that to match your data:
df1MissingDataFrequency = np.zeros((24,20))
Then, in every cycle, add 1 to each cell that holds a nan value (your code has already replaced inf with nan):
df1MissingDataFrequency = df1MissingDataFrequency + np.isnan(df1).astype(int)
over all your cycles.
You seem to have some problems with your indentation. I don't know whether that only affects the code you posted here or your actual code as well, but at the moment you create a new plot in every cycle and you redefine df1, df2, df3 for each i.
With your missing frequency data your code should look like this:
import numpy as np
import pandas as pd
import os
import seaborn as sns
import matplotlib.pyplot as plt
def mkdf(ListOf480Numbers, sections=8, rows=6):
    """Arrange a flat sequence as sections of lines with alternating direction.

    ``np.array_split`` cuts the input into ``sections`` chunks and every chunk
    into ``rows`` lines. Lines 0, 2, 4, ... of each section are reversed,
    lines 1, 3, 5, ... are left unchanged.

    Args:
        ListOf480Numbers: Flat sequence of values (list, ndarray or pandas
            Series); 480 values with the defaults yield 8 sections of
            6 lines x 10 values.
        sections: How many sections to create.
        rows: How many lines each section is split into.

    Returns:
        List of ``sections`` lists, each holding ``rows`` 1-D NumPy arrays.
    """
    # Convert once so that a Series argument is split by position only.
    flat = np.asarray(ListOf480Numbers)
    fixMatrix = []
    for section in np.array_split(flat, sections):
        newMatrix = []
        for j, line in enumerate(np.array_split(section, rows)):
            # Even-numbered lines are stored back to front.
            newMatrix.append(line[::-1] if j % 2 == 0 else line)
        fixMatrix.append(newMatrix)
    return fixMatrix
def print_df(fixMatrix):
    """Stack paired sections into a single DataFrame.

    Each band of the result joins line ``i`` of an even-numbered section
    (left half) with line ``i`` of the following odd-numbered section (right
    half). Bands are emitted from the last pair down to the pair (0, 1), so
    sections 0 and 1 occupy the bottom rows. For 8 sections of 6 x 10 values
    the frame is 24 x 20.

    Args:
        fixMatrix: Even-length sequence of sections; each section is a
            sequence of lines, each line an iterable of values.

    Returns:
        pandas.DataFrame containing the stacked bands.

    Raises:
        ValueError: If ``fixMatrix`` holds an odd number of sections.
    """
    if len(fixMatrix) % 2:
        raise ValueError("print_df needs an even number of sections")

    # Left-hand section index of every band, top band first.
    band_starts = range(len(fixMatrix) - 2, -1, -2)
    values = [
        [*left_line, *right_line]
        for start in band_starts
        for left_line, right_line in zip(fixMatrix[start], fixMatrix[start + 1])
    ]
    return pd.DataFrame(values)
dft = pd.read_csv('D:/Feryan2.txt', header=None)

# Rows are consumed in groups of four by position: id, A, B, C.
id_set = dft[dft.index % 4 == 0].astype('int').values
A = dft[dft.index % 4 == 1].values
B = dft[dft.index % 4 == 2].values
C = dft[dft.index % 4 == 3].values
data = {'A': A[:,0], 'B': B[:,0], 'C': C[:,0]}
df = pd.DataFrame(data, columns=['A','B','C'], index = id_set[:,0])

# Both masks are taken before inf is folded into nan, so they stay distinct.
nan = np.array(df.isnull())
inf = np.array(np.isinf(df))
# Treat +/-inf as missing; a single replace covers both signs.
df = df.replace([np.inf, -np.inf], np.nan)

# Per-cell count of cycles in which the value was missing, one grid per column.
df1MissingDataFrequency = np.zeros((24,20))
df2MissingDataFrequency = np.zeros((24,20))
df3MissingDataFrequency = np.zeros((24,20))

# Number of complete 480-value cycles available in the data.
cycles = len(df) // 480
print(cycles)

for cycle in range(cycles):
    count = '{:04}'.format(cycle)
    j = cycle * 480
    new_value1 = df['A'].iloc[j:j+480]
    new_value2 = df['B'].iloc[j:j+480]
    new_value3 = df['C'].iloc[j:j+480]
    df1 = print_df(mkdf(new_value1))
    df2 = print_df(mkdf(new_value2))
    df3 = print_df(mkdf(new_value3))

    # One CSV per column and cycle, each written into that column's folder.
    for i, grid in zip(df, (df1, df2, df3)):
        os.makedirs(i, exist_ok=True)
        grid.to_csv(f'{i}/norm{i}{count}.csv', header=None, index=None)

    # Count this cycle's missing cells exactly once; the boolean mask adds
    # 1 where the value is missing and 0 elsewhere.
    df1MissingDataFrequency += df1.isna().to_numpy()
    df2MissingDataFrequency += df2.isna().to_numpy()
    df3MissingDataFrequency += df3.isna().to_numpy()

    # Plot the frequencies accumulated so far for ['A','B','C'] side by side.
    fig, axes = plt.subplots(nrows=1, ncols=3, figsize=(10,7))
    frequencies = (df1MissingDataFrequency, df2MissingDataFrequency, df3MissingDataFrequency)
    for ax, frequency, label in zip(axes, frequencies, 'ABC'):
        sns.heatmap(frequency, cbar=False, cmap="gray", ax=ax)
        # White separators between the 6x10 sections.
        for y in (6, 12, 18):
            ax.axhline(y=y, color='w', linewidth=1.5)
        ax.axvline(x=10, color='w', linewidth=1.5)
        ax.set_title('Missing-data frequency in ' + label, fontsize=20, fontweight='bold', color='black', loc='center', style='italic')
        ax.axis('off')
    fig.suptitle('Missing-data visualization', color='yellow', backgroundcolor='black', fontsize=15, fontweight='bold')
    fig.subplots_adjust(top=0.92, bottom=0.02, left=0.05, right=0.96, hspace=0.2, wspace=0.2)
    fig.text(0.035, 0.93, 'dataset1', fontsize=19, fontweight='bold', rotation=42., ha='center', va='center', bbox=dict(boxstyle="round", ec=(1., 0.5, 0.5), fc=(1., 0.8, 0.8)))
    # `i` still holds the last column name here, so the figure is saved in
    # that column's folder.
    fig.savefig(f'{i}/result{count}.png')
    # Release the figure so figures do not pile up across cycles.
    plt.close(fig)
Which gives you the output you want:
EDIT
In the spirit of DRY, I edited your code so that you no longer need df1, df2, df3, new_value1, ... and no longer copy and paste the same statements everywhere. You already loop over i, so you should use it to actually address the three different columns in your dataframe:
# inf is folded into nan right at load time, so only nan has to be counted.
dft = pd.read_csv('C:/Users/frefra/Downloads/Feryan2.txt', header=None).replace([np.inf, -np.inf], np.nan)

# Rows are consumed in groups of four by position: id, A, B, C.
id_set = dft[dft.index % 4 == 0].astype('int').values
A = dft[dft.index % 4 == 1].values
B = dft[dft.index % 4 == 2].values
C = dft[dft.index % 4 == 3].values
data = {'A': A[:,0], 'B': B[:,0], 'C': C[:,0]}
df = pd.DataFrame(data, columns=['A','B','C'], index = id_set[:,0])

# Every per-cycle slice and grid is collected here; nothing below reads them.
new_values = []
dfs = []
# One 24x20 counter grid per column, indexed in column order.
nan_frequencies = np.zeros((3,24,20))

# Number of complete 480-value cycles available in the data.
cycles = len(df) // 480
print(cycles)

# Create the output folders once instead of retrying in every cycle.
for i in df:
    os.makedirs(i, exist_ok=True)

for cycle in range(cycles):
    count = '{:04}'.format(cycle)
    j = cycle * 480
    for idx, i in enumerate(df):
        new_value = df[i].iloc[j:j+480]
        new_values.append(new_value)
        dfi = print_df(mkdf(new_value))
        dfs.append(dfi)
        dfi.to_csv(f'{i}/norm{i}{count}.csv', header=None, index=None)
        # The boolean mask adds 1 where the value is missing, 0 elsewhere.
        nan_frequencies[idx] += dfi.isna().to_numpy()

# Plot all columns ['A','B','C'] in one window, side by side.
fig, axes = plt.subplots(nrows=1, ncols=3, figsize=(10,7))
for idx, (ax, i) in enumerate(zip(axes, df)):
    sns.heatmap(nan_frequencies[idx], cbar=False, cmap="gray", ax=ax)
    # White separators between the 6x10 sections.
    for y in (6, 12, 18):
        ax.axhline(y=y, color='w', linewidth=1.5)
    ax.axvline(x=10, color='w', linewidth=1.5)
    ax.set_title('Missing-data frequency in ' + i, fontsize=20, fontweight='bold', color='black', loc='center', style='italic')
    ax.axis('off')
来源:https://stackoverflow.com/questions/54394457/visualisation-of-missing-data-occurrence-frequency-by-using-seaborn