首页 > 解决方案 > 如何在 Python 中进行复杂的索引?

问题描述

我已经从 Matlab 过渡到 Python,并且在索引/子集方面遇到了问题。我有两个数据框。一个是CT,另一个是oco。我必须根据纬度/经度从 oco 中提取值,并从 CT 数据帧中提取一些余量。我的 Matlab 代码运行良好,但是当我在 Python 中尝试时,它给出了不同的值。

下面是matlab代码:

clc 
clear 

dataleng = [];

myFolderct = '/Users/farhanmustafa/Documents/analysis2/year2012/';

if ~isdir(myFolderct)
  errorMessage = sprintf('Error: The following folder does not exist:\n%s', myFolder);
  uiwait(warndlg(errorMessage));
  return;
end


dategg = [];
 CTall = [];  SXCT = [];   GO1 = []; 

 
myFolder = '/Users/farhanmustafa/Documents/analysis2/gosat2012/';

if ~isdir(myFolder)
  errorMessage = sprintf('Error: The following folder does not exist:\n%s', myFolder);
  uiwait(warndlg(errorMessage));
  return;
end

filePattern = fullfile(myFolder,'*.nc4');
theFiles = dir(filePattern);
dataleng = [dataleng;length(theFiles)];

for k = 1:1
 baseFileName = theFiles(k).name;
 fullFileName = fullfile(myFolder, baseFileName);
 fprintf(1, 'Now reading %s\n', fullFileName);
 
 ncid = netcdf.open(fullFileName,'NC_NOWRITE');
 varid = netcdf.inqVarID(ncid,'longitude');
 lon = netcdf.getVar(ncid,varid);
 
 
 varid = netcdf.inqVarID(ncid,'latitude');
 lat = netcdf.getVar(ncid,varid);
 
  varid = netcdf.inqVarID(ncid,'xco2'); 
  xco2 = netcdf.getVar(ncid,varid);
 
  
  varid = netcdf.inqVarID(ncid,'xco2_uncertainty'); 
  err = netcdf.getVar(ncid,varid);
  
  varid = netcdf.inqVarID(ncid,'xco2_quality_flag'); 
  qflag = netcdf.getVar(ncid,varid);
  qflagd = double(qflag);
  
  varid = netcdf.inqVarID(ncid,'xco2_averaging_kernel'); 
  KA = netcdf.getVar(ncid,varid);
  KAm = flip(KA);
  
  varid = netcdf.inqVarID(ncid,'pressure_levels'); 
  ocopl = netcdf.getVar(ncid,varid);
  ocoplm = flip(ocopl*100);
  
  
   varid = netcdf.inqVarID(ncid,'pressure_weight'); 
  pw = netcdf.getVar(ncid,varid);
  pwm = flip(pw);
  
   varid = netcdf.inqVarID(ncid,'xco2_apriori'); 
  Xa = netcdf.getVar(ncid,varid);
  Xam = Xa';
  
   varid = netcdf.inqVarID(ncid,'co2_profile_apriori'); 
  Pa = netcdf.getVar(ncid,varid);
  Pam = flip(Pa);
    
  
  varid = netcdf.inqVarID(ncid,'time');
  t = netcdf.getVar(ncid,varid);
  t = double(t)/86400 + datenum([1970 01 01 00 00 00]);
 
  [Y,M,D,H,MM,SS] = datevec(double(t)); 
  
 oco = [t,Y,M,D,H,lon,lat,xco2,err, qflagd];
 
 qflag_filter = (oco(:,10)==0);
 
 t = oco(qflag_filter,1);
 Y = oco(qflag_filter,2);
 M = oco(qflag_filter,3);
 D = oco(qflag_filter,4);
 H = oco(qflag_filter,5);
 lon = oco(qflag_filter,6);
 lat = oco(qflag_filter,7);
 xco2 = oco(qflag_filter,8);
 err = oco(qflag_filter,9);
 qflagd = oco(qflag_filter,10);
  
 oco = [t,Y,M,D,H,lon,lat,xco2,err, qflagd];
 

filePatternct = fullfile(myFolderct,'*.nc');
theFilesct = dir(filePatternct);
dataleng = [dataleng;length(theFilesct)];
baseFileNamect = theFilesct(k).name;
fullFileNamect = fullfile(myFolderct, baseFileNamect);
fprintf(1, 'Now reading %s\n', fullFileNamect);
ncid = netcdf.open(fullFileNamect,'NC_NOWRITE');
  
varid = netcdf.inqVarID(ncid,'longitude')  ;
lonct = netcdf.getVar(ncid,varid);
 
varid = netcdf.inqVarID(ncid,'latitude');
latct = netcdf.getVar(ncid,varid);
 
 varid = netcdf.inqVarID(ncid,'co2');
 ct = double(netcdf.getVar(ncid,varid));
  
 varid = netcdf.inqVarID(ncid,'pressure');
 ctgph = double(netcdf.getVar(ncid,varid));
  
 varid = netcdf.inqVarID(ncid,'time');
 tct = netcdf.getVar(ncid,varid);
 datect = double(tct) + datenum([2000 01 01 00 00 00]);
 [Yct,Mct,Dct,HHct] = datevec(double(tct) + datenum([2000 01 01 00 00 00]));
 
 netcdf.close(ncid)
 
 l1 = find(lonct>=28.5 & lonct<=142.5);
 l2 = find(latct>=-7 & latct<=55);
CT =[];
CTgph = [];
load /Users/farhanmustafa/Documents/MATLAB/grid_arrengiment_index2.mat
  for ii = 1:3:23
   ih = find(HHct==ii); 
 ct1 = ct(l1,l2,1,ih); ct2 = ct(l1,l2,2,ih);ct3 = ct(l1,l2,3,ih);
ct4 = ct(l1,l2,4,ih);ct5 = ct(l1,l2,5,ih);ct6 = ct(l1,l2,6,ih);
ct7 = ct(l1,l2,7,ih);ct8 = ct(l1,l2,8,ih);ct9 = ct(l1,l2,9,ih);
 ct10 = ct(l1,l2,10,ih);ct11 = ct(l1,l2,11,ih);ct12 = ct(l1,l2,12,ih);
ct13 = ct(l1,l2,13,ih);ct14 = ct(l1,l2,14,ih);ct15 = ct(l1,l2,15,ih);
ct16 = ct(l1,l2,16,ih);ct17 = ct(l1,l2,17,ih);ct18 = ct(l1,l2,18,ih);
 ct19 = ct(l1,l2,19,ih);ct20 = ct(l1,l2,20,ih);ct21 = ct(l1,l2,21,ih);
ct22 = ct(l1,l2,22,ih);ct23 = ct(l1,l2,23,ih);ct24 = ct(l1,l2,24,ih);
ct25 = ct(l1,l2,25,ih);

    gph1 = ctgph(l1,l2,1,ih);gph2 = ctgph(l1,l2,2,ih);gph3 = ctgph(l1,l2,3,ih);
gph4 = ctgph(l1,l2,4,ih);gph5 = ctgph(l1,l2,5,ih);gph6 = ctgph(l1,l2,6,ih);
gph7 = ctgph(l1,l2,7,ih);gph8 = ctgph(l1,l2,8,ih);gph9 = ctgph(l1,l2,9,ih);
 gph10 = ctgph(l1,l2,10,ih);gph11 = ctgph(l1,l2,11,ih);gph12 = ctgph(l1,l2,12,ih);
gph13 = ctgph(l1,l2,13,ih);gph14 = ctgph(l1,l2,14,ih);gph15 = ctgph(l1,l2,15,ih);
gph16 = ctgph(l1,l2,16,ih);gph17 = ctgph(l1,l2,17,ih);gph18 = ctgph(l1,l2,18,ih);
 gph19 = ctgph(l1,l2,19,ih);gph20 = ctgph(l1,l2,20,ih);gph21 = ctgph(l1,l2,21,ih);
gph22 = ctgph(l1,l2,22,ih);gph23 = ctgph(l1,l2,23,ih);gph24 = ctgph(l1,l2,24,ih);
gph25 = ctgph(l1,l2,25,ih);



 lonaf= lonct(l1);lataf = latct(l2);
  
ct1 = ct1(:);ct2 = ct2(:);ct3 = ct3(:);ct4 = ct4(:);ct5 = ct5(:);
ct6 = ct6(:);ct7 = ct7(:);ct8 = ct8(:);ct9 = ct9(:);ct10 = ct10(:);
ct11 = ct11(:);ct12 = ct12(:);ct13 = ct13(:);ct14 = ct14(:);ct15 = ct15(:);
ct16 = ct16(:);ct17 = ct17(:);ct18 = ct18(:);ct19 = ct19(:);ct20 = ct20(:);
ct21 = ct21(:);ct22 = ct22(:);ct23 = ct23(:);ct24 = ct24(:);ct25 = ct25(:);
f = ~isnan(gph1);
gph1 = gph1(:);gph2 = gph2(:);gph3 = gph3(:);gph4 = gph4(:);gph5 = gph5(:);
gph6 = gph6(:);gph7 = gph7(:);gph8 = gph8(:);gph9 = gph9(:);gph10 = gph10(:);
gph11 = gph11(:);gph12 = gph12(:);gph13 = gph13(:);gph14 = gph14(:);gph15 = gph15(:);
gph16 = gph16(:);gph17 = gph17(:);gph18 = gph18(:);gph19 = gph19(:);gph20 = gph20(:);
gph21 = gph21(:);gph22 = gph22(:);gph23 = gph23(:);gph24 = gph24(:);gph25 = gph25(:);


 [n,m,] = find(f==1);
 
  yy = [];
 for ii = 1:length(n)
 yy = [yy;datect(ih),Yct(ih),Mct(ih),Dct(ih),HHct(ih)];
 end

CT = [CT;yy(1:length(n),:),lonaf(n),lataf(m),ct1,ct2,ct3,ct4,ct5,ct6,ct7,ct8...
    ,ct9,ct10,ct11,ct12,ct13,ct14,ct15,ct16,ct17,ct18,ct19,ct20,ct21,ct22,ct23,ct24,ct25];

CTgph = [CTgph;yy(1:length(n),:),lonaf(n),lataf(m),gph1,gph2,gph3,gph4,gph5,gph6,gph7,gph8...
    ,gph9,gph10,gph11,gph12,gph13,gph14,gph15,gph16,gph17,gph18,gph19,gph20,gph21,gph22,gph23,gph24,gph25];

  
  end
  CTall = [CTall;CT];
 
  
di = CT(:,1); ilon = CT(:,6); ilat=CT(:,7);
gd= oco(:,1); glon = oco(:,6); glat=oco(:,7);

for ii = 1:length(CT)
    
    indx = find(glon(:)<=(ilon(ii)+1.5) & glon(:)>=(ilon(ii)-1.5) & ...
                glat(:)<=(ilat(ii)+1.5) & glat(:)>=(ilat(ii)-1.5));
            
          if indx~=0
              
            
             GO1 = [GO1;oco(indx,:)];
                
  end  
    
end


end 


Python代码如下:

import os
import glob
import numpy as np
import xarray as xr
from datetime import datetime as dt
from datetime import timedelta
import pandas as pd
import matplotlib.pyplot as plt
import re

def find_files_in_dir(path, substrs):   
    file_list = []
    for root, directory, files in os.walk(path):
        for f in files:
            for s in substrs:
                if s in f:
                    file_list.append(os.path.join(root, f))

    file_list.sort()
    return file_list

myFolder = '/Users/farhanmustafa/Documents/analysis2/gosat2012/'
sat_substrs = ['.nc4']
myFolderct = '/Users/farhanmustafa/Documents/analysis2/year2012/'
ct_substrs = ['.nc']
sat_file_list = find_files_in_dir(myFolder, sat_substrs)
ct_file_list = find_files_in_dir(myFolderct, ct_substrs)

CTall = pd.DataFrame(list(zip(*[]))).add_prefix('Col')
GO1 = pd.DataFrame(list(zip(*[]))).add_prefix('Col')
for i in range(len(sat_file_list)):
    print("Reading", sat_file_list[i])
    ds = xr.open_dataset(sat_file_list[i])
    ds = ds.swap_dims({'sounding_id': 'time'})
    lon_sat = ds.longitude.values.tolist()
    lat_sat = ds.latitude.values.tolist()
    xco2 = ds.xco2.values.tolist()
    err = ds.xco2_uncertainty.values.tolist()
    qflag = ds.xco2_quality_flag.values.tolist()
    KA = ds.xco2_averaging_kernel
    KAm = np.flip(KA, 0)
    ocopl = ds.pressure_levels
    ocoplm = np.flip((ocopl*100), 0)
    pw = ds.pressure_weight
    pwm = np.flip(pw)
    Xa = ds.xco2_apriori
    Xam = Xa.T
    Pa = ds.co2_profile_apriori
    time_sat = ds.time.values.tolist()
    time_sat = pd.to_datetime(time_sat)

    oco = pd.DataFrame(list(zip(*[time_sat, lon_sat, lat_sat, xco2, err, qflag]))).add_prefix('Col')
    ds.close()
    
    print("Reading", ct_file_list[i])
    ds = xr.open_dataset(ct_file_list[i])
    longitude=slice(28.5, 142.5)
    latitude=slice(-7, 55)
    ct = ds.co2.sel(longitude=longitude, latitude=latitude)
    ctgph = ds.pressure
    #time = ds.time.dt.hour[7]
    #time = time.time.values
    #ct_df = pd.DataFrame(list(zip(*[]))).add_prefix('Col')
    CT = pd.DataFrame(list(zip(*[]))).add_prefix('Col')
    #gph_df = pd.DataFrame(list(zip(*[]))).add_prefix('Col')
    CTgph = pd.DataFrame(list(zip(*[]))).add_prefix('Col')
    for i in [0,1,2,3,4,5,6,7]:
        time = ds.time.dt.hour[i]
        time = time.time.values

        ct1 = (ct.sel(level=1)).isel(time=i).values.flatten()
        ct2 = (ct.sel(level=2)).isel(time=i).values.flatten()
        ct3 = (ct.sel(level=3)).isel(time=i).values.flatten()
        ct4 = (ct.sel(level=4)).isel(time=i).values.flatten()
        ct5 = (ct.sel(level=5)).isel(time=i).values.flatten()
        ct6 = (ct.sel(level=6)).isel(time=i).values.flatten()
        ct7 = (ct.sel(level=7)).isel(time=i).values.flatten()
        ct8 = (ct.sel(level=8)).isel(time=i).values.flatten()
        ct9 = (ct.sel(level=9)).isel(time=i).values.flatten()
        ct10 = (ct.sel(level=10)).isel(time=i).values.flatten()
        ct11 = (ct.sel(level=11)).isel(time=i).values.flatten()
        ct12 = (ct.sel(level=12)).isel(time=i).values.flatten()
        ct13 = (ct.sel(level=13)).isel(time=i).values.flatten()
        ct14 = (ct.sel(level=14)).isel(time=i).values.flatten()
        ct15 = (ct.sel(level=15)).isel(time=i).values.flatten()
        ct16 = (ct.sel(level=16)).isel(time=i).values.flatten()
        ct17 = (ct.sel(level=17)).isel(time=i).values.flatten()
        ct18 = (ct.sel(level=18)).isel(time=i).values.flatten()
        ct19 = (ct.sel(level=19)).isel(time=i).values.flatten()
        ct20 = (ct.sel(level=20)).isel(time=i).values.flatten()
        ct21 = (ct.sel(level=21)).isel(time=i).values.flatten()
        ct22 = (ct.sel(level=22)).isel(time=i).values.flatten()
        ct23 = (ct.sel(level=23)).isel(time=i).values.flatten()
        ct24 = (ct.sel(level=24)).isel(time=i).values.flatten()
        ct25 = (ct.sel(level=25)).isel(time=i).values.flatten()
        
        gph1 = (ctgph.sel(boundary=1)).isel(time=i).values.flatten()
        gph2 = (ctgph.sel(boundary=2)).isel(time=i).values.flatten()
        gph3 = (ctgph.sel(boundary=3)).isel(time=i).values.flatten()
        gph4 = (ctgph.sel(boundary=4)).isel(time=i).values.flatten()
        gph5 = (ctgph.sel(boundary=5)).isel(time=i).values.flatten()
        gph6 = (ctgph.sel(boundary=6)).isel(time=i).values.flatten()
        gph7 = (ctgph.sel(boundary=7)).isel(time=i).values.flatten()
        gph8 = (ctgph.sel(boundary=8)).isel(time=i).values.flatten()
        gph9 = (ctgph.sel(boundary=9)).isel(time=i).values.flatten()
        gph10 = (ctgph.sel(boundary=10)).isel(time=i).values.flatten()
        gph11 = (ctgph.sel(boundary=11)).isel(time=i).values.flatten()
        gph12 = (ctgph.sel(boundary=12)).isel(time=i).values.flatten()
        gph13 = (ctgph.sel(boundary=13)).isel(time=i).values.flatten()
        gph14 = (ctgph.sel(boundary=14)).isel(time=i).values.flatten()
        gph15 = (ctgph.sel(boundary=15)).isel(time=i).values.flatten()
        gph16 = (ctgph.sel(boundary=16)).isel(time=i).values.flatten()
        gph17 = (ctgph.sel(boundary=17)).isel(time=i).values.flatten()
        gph18 = (ctgph.sel(boundary=18)).isel(time=i).values.flatten()
        gph19 = (ctgph.sel(boundary=19)).isel(time=i).values.flatten()
        gph20 = (ctgph.sel(boundary=20)).isel(time=i).values.flatten()
        gph21 = (ctgph.sel(boundary=21)).isel(time=i).values.flatten()
        gph22 = (ctgph.sel(boundary=22)).isel(time=i).values.flatten()
        gph23 = (ctgph.sel(boundary=23)).isel(time=i).values.flatten()
        gph24 = (ctgph.sel(boundary=24)).isel(time=i).values.flatten()
        gph25 = (ctgph.sel(boundary=25)).isel(time=i).values.flatten()

        listOfStrings2 = [time]*len(ct1)
        
        lonct = ct.longitude.values.tolist()
        latct = ct.latitude.values.tolist()

        
        lonct = [a for a in lonct  
          for b in latct if a != b] 

        latct = [b for a in lonct  
          for b in latct if a != b] 

        CT2 = pd.DataFrame(list(zip(*[listOfStrings2,lonct,latct,ct1,ct2,ct3,ct4,ct5,ct6,ct7,ct8,ct9,ct10,ct11,ct12,ct13,ct14,ct15,ct16,ct17,ct18,ct19,ct20,ct21,ct22,ct23,ct24,ct25]))).add_prefix('Col')
        CTgph = pd.DataFrame(list(zip(*[listOfStrings2,lonct,latct,gph1,gph2,gph3,gph4,gph5,gph6,gph7,gph8,gph9,gph10,gph11,gph12,gph13,gph14,gph15,gph16,gph17,gph18,gph19,gph20,gph21,gph22,gph23,gph24,gph25]))).add_prefix('Col')
        #gph_df = gph_df.append(gph_df2)
        CT = CT.append(CT2)
        ds.close()
    CTall = CTall.append(CT)
    
    
    di = CT.Col0
    ilon = CT.Col1
    ilat = CT.Col2
    sat_date = oco.Col0
    sat_lon = oco.Col1
    sat_lat = oco.Col2
    

    for ii in range(len(CT)):
        subset = oco[(ilat[ii]<=(sat_lat[:]+1.5)) & (ilat[ii]>=(sat_lat[:]-1.5)) & 
                         (ilon[ii]<=(sat_lon[:]+1.5)) & (ilon[ii]>=(sat_lon[:]-1.5))]
    
        GO1 = GO1.append(subset)
    

请注意,两种情况下的数据帧都是相同的。我会感谢有人帮助我解决这个问题。

标签: pythonmatlabindexingsubset

解决方案


推荐阅读