Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

use multivariate series with data generator #42

Open
ggous opened this issue Sep 2, 2021 · 0 comments
Open

use multivariate series with data generator #42

ggous opened this issue Sep 2, 2021 · 0 comments

Comments

@ggous
Copy link

ggous commented Sep 2, 2021

Hi, I want to use a multivariate series.

So, I have for example t2m and sm100 data. I want to use both in order to train the model but predict on t2m.

I tried to use the data generator from here but when when I call the fit method, it throws

ValueError: applied function returned data with unexpected number of dimensions. Received 1 dimension(s) but expected 0 dimensions with names: ()

at the line y = self.data.isel(forecast_time=idxs + self.lead_time).values in __getitem__ method.

Also, not that in the data generator , I have commented out the lines

self.n_samples = self.data.isel(forecast_time=slice(0, -lead_time)).shape[0]
self.init_time = self.data.isel(forecast_time=slice(None, -lead_time)).forecast_time
self.valid_time = self.data.isel(forecast_time=slice(lead_time, None)).forecast_time

( I am using self.n_samples = self.data.forecast_time.size instead)

If I use it , it throws me:

TypeError: 'DataArray' object cannot be interpreted as an integer

Any ideas about that?
Thanks!

the code:

import xarray as xr
import tensorflow as tf
import numpy as np
from collections import OrderedDict
#from tensorflow.keras.layers import Input, Conv2D, Dense
from tensorflow.keras import Model, Sequential
from tensorflow.keras.layers import Dense, Dropout, LSTM, Input, Conv2D, TimeDistributed,\
    MaxPooling2D, Flatten, RepeatVector, Reshape, Lambda, GlobalAveragePooling2D,\
        Bidirectional, ConvLSTM2D, BatchNormalization
        
t2m =  xr.open_dataset("/home/ggousios/s2s-ai-challenge-agroapps/t2m.nc")
sm100 =  xr.open_dataset("/home/ggousios/s2s-ai-challenge-agroapps/sm100.nc")

lead_time = t2m.isel(lead_time=0).lead_time

class PeriodicPadding2D(tf.keras.layers.Layer):
    def __init__(self,
                 pad_width, 
                 **kwargs):
        super().__init__(**kwargs)
        self.pad_width = pad_width

    def call(self, 
             inputs,
             **kwargs):
        if self.pad_width == 0:
            return inputs
        inputs_padded = tf.concat(
            [inputs[:, :, -self.pad_width:, :],
             inputs,
             inputs[:, :, :self.pad_width, :]],
             axis=2)
        # Zero padding in the lat direction
        inputs_padded = tf.pad(inputs_padded, 
                               [[0, 0],
                                [self.pad_width,
                                 self.pad_width],
                                [0, 0], 
                                [0, 0]])
        return inputs_padded

    def get_config(self):
        config = super().get_config()
        config.update({'pad_width': self.pad_width})
        return config


class PeriodicConv2D(tf.keras.layers.Layer):
    def __init__(self,
                 filters,
                 kernel_size,
                 conv_kwargs={},
                 **kwargs, ):
        super().__init__(**kwargs)
        self.filters = filters
        self.kernel_size = kernel_size
        self.conv_kwargs = conv_kwargs
        if type(kernel_size) is not int:
            assert kernel_size[0] == kernel_size[1], \
                'PeriodicConv2D only works for square kernels'
            kernel_size = kernel_size[0]
        pad_width = (kernel_size - 1) // 2
        self.padding = PeriodicPadding2D(pad_width)
        self.conv = Conv2D(
            filters, kernel_size, padding='valid', **conv_kwargs
        )

    def call(self, inputs):
        return self.conv(self.padding(inputs))

    def get_config(self):
        config = super().get_config()
        config.update({'filters': self.filters,
                       'kernel_size': self.kernel_size, 
                       'conv_kwargs': self.conv_kwargs})
        return config
    
class DataGenerator(tf.keras.utils.Sequence):
    def __init__(self, 
                 ds,
                 var_dict,
                 lead_time,
                 batch_size, 
                 shuffle=True,
                 load=True,
                 mean=None,
                 std=None):
        """
        Data generator for WeatherBench data.
        Template from https://stanford.edu/~shervine/blog/keras-how-to-generate-data-on-the-fly
        Args:
            ds: Dataset containing all variables
            var_dict: Dictionary of the form {'var': level}. Use None for level if data is of single level
            lead_time: Lead time in hours
            batch_size: Batch size
            shuffle: bool. If True, data is shuffled.
            load: bool. If True, datadet is loaded into RAM.
            mean: If None, compute mean from data.
            std: If None, compute standard deviation from data.
        """
        self.ds = ds
        self.var_dict = var_dict
        self.batch_size = batch_size
        self.shuffle = shuffle
        self.lead_time = lead_time

        data = []
        generic_level = xr.DataArray([1])#, coords={'realization': [1]}, dims=['realization'])
        for var, levels in var_dict.items():
            #try:
            data.append(ds[var])#.sel(realization=levels))
            #except ValueError:
            #    data.append(ds[var].expand_dims({'realization': generic_level}, 1))

        self.data = xr.concat(data, 'realization').transpose('forecast_time', ...)
        self.mean = self.data.mean(('forecast_time')).compute() if mean is None else mean
        self.std = self.data.std('forecast_time').compute() if std is None else std
        # Normalize
        self.data = (self.data - self.mean) / self.std
    
        #self.n_samples = self.data.isel(forecast_time=slice(0, -lead_time)).shape[0]
        self.n_samples = self.data.forecast_time.size
        # self.init_time = self.data.isel(forecast_time=slice(None, -lead_time)).forecast_time
        # self.valid_time = self.data.isel(forecast_time=slice(lead_time, None)).forecast_time

        self.on_epoch_end()

        # For some weird reason calling .load() earlier messes up the mean and std computations
        if load: print('Loading data into RAM'); self.data.load()

    def __len__(self):
        'Denotes the number of batches per epoch'
        return int(np.ceil(self.n_samples / self.batch_size))

    def __getitem__(self, i):
        'Generate one batch of data'
        idxs = self.idxs[i * self.batch_size:(i + 1) * self.batch_size]
        X = self.data.isel(forecast_time=idxs).values
        y = self.data.isel(forecast_time=idxs + self.lead_time).values
        return X, y

    def on_epoch_end(self):
        'Updates indexes after each epoch'
        self.idxs = np.arange(self.n_samples)
        if self.shuffle == True:
            np.random.shuffle(self.idxs)
            
            
datasets = [t2m, sm100]
ds = xr.merge(datasets)
# I am choosing only t2m (not tp) and sm100 here
dic = OrderedDict({'t2m': None, 'sm100': None})


dg_train = DataGenerator(
    ds.sel(forecast_time=slice('2000', '2001')),
    dic,
    lead_time=lead_time,
    batch_size=8,
    load=True)

dg_valid = DataGenerator(
    ds.sel(forecast_time=slice('2018', '2019')),
    dic,
    lead_time=lead_time,
    batch_size=8,
    mean=dg_train.mean,
    std=dg_train.std,
    shuffle=False)

def custom_categ_crossentropy(y_true, y_pred, sample_weight=None):
    y_true = tf.one_hot(tf.cast(y_true,'int32'), depth=3)
    loss = tf.keras.losses.CategoricalCrossentropy()(y_true, y_pred)
    return loss

    
def build_cnn(filters,
              kernels,
              input_shape):
    
        inputs = Input(batch_shape=(8,
                                    121,
                                    240,
                                    1))
       
        print(inputs.shape)
        x = (PeriodicConv2D(filters,
                            kernels,
                            conv_kwargs={'activation':'relu'}))((inputs))
       
        x = PeriodicConv2D(32,
                           5,
                           conv_kwargs={'activation':'relu'})(x)
       
        
        output = Dense(3, activation='softmax')(x)
       
        model = Model(inputs, output)
        model.compile(optimizer='sgd',
                      loss=custom_categ_crossentropy,
                      metrics=['accuracy'])
        print(model.summary())
        return model
    
def fit():
    
    model = build_cnn(64, 9, (121, 240, 1))
    history = model.fit(dg_train,
                       epochs=10,
                       validation_data=dg_valid)
                
    return history
    
history = fit()
    

The data

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
None yet
Projects
None yet
Development

No branches or pull requests

1 participant