Skip to content

Commit 6659c89

Browse files
committed
feat(loss): update seflow loss calculation.
some notes for pointing out the equation to the paper.
1 parent 3b11c10 commit 6659c89

10 files changed

Lines changed: 226 additions & 92 deletions

File tree

1_train.py

Lines changed: 15 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -28,10 +28,11 @@
2828

2929
@hydra.main(version_base=None, config_path="conf", config_name="config")
3030
def main(cfg):
31+
if cfg.loss_fn == 'seflowLoss' and cfg.add_seloss is None:
32+
raise ValueError("Please specify the self-supervised loss items for seflowLoss.")
3133
pl.seed_everything(cfg.seed, workers=True)
32-
output_dir = HydraConfig.get().runtime.output_dir
3334

34-
train_dataset = HDF5Dataset(cfg.train_data)
35+
train_dataset = HDF5Dataset(cfg.train_data, dufo=(cfg.loss_fn == 'seflowLoss'))
3536
train_loader = DataLoader(train_dataset,
3637
batch_size=cfg.batch_size,
3738
shuffle=True,
@@ -48,7 +49,14 @@ def main(cfg):
4849

4950
# count gpus, overwrite gpus
5051
cfg.gpus = torch.cuda.device_count() if torch.cuda.is_available() else 0
51-
model_name = cfg.model.name
52+
53+
# only for logging on folder name.
54+
if cfg.loss_fn == 'seflowLoss':
55+
method_name = "seflow"
56+
cfg.output = cfg.output.replace("deflow", "seflow")
57+
else:
58+
method_name = cfg.model.name
59+
output_dir = HydraConfig.get().runtime.output_dir + f"/{cfg.output}"
5260
Path(os.path.join(output_dir, "checkpoints")).mkdir(parents=True, exist_ok=True)
5361

5462
cfg = DictConfig(OmegaConf.to_container(cfg, resolve=True))
@@ -57,7 +65,7 @@ def main(cfg):
5765
callbacks = [
5866
ModelCheckpoint(
5967
dirpath=os.path.join(output_dir, "checkpoints"),
60-
filename="{epoch:02d}_"+model_name,
68+
filename="{epoch:02d}_"+method_name,
6169
auto_insert_metric_name=False,
6270
monitor=cfg.model.val_monitor,
6371
mode="min",
@@ -90,6 +98,9 @@ def main(cfg):
9098
print("Initiating wandb and trainer successfully. ^V^ ")
9199
print(f"We will use {cfg.gpus} GPUs to train the model. Check the checkpoints in {output_dir} checkpoints folder.")
92100
print("Total Train Dataset Size: ", len(train_dataset))
101+
if cfg.add_seloss is not None and cfg.loss_fn == 'seflowLoss':
102+
print(f"Note: We are in **self-supervised** training now. No ground truth label is used.")
103+
print(f"We will use these loss items in {cfg.loss_fn}: {cfg.add_seloss}")
93104
print("-"*40+"\n")
94105

95106
# NOTE(Qingwen): search & check: def training_step(self, batch, batch_idx)

README.md

Lines changed: 24 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@ SeFlow: A Self-Supervised Scene Flow Method in Autonomous Driving
66
[poster coming soon]
77
[video coming soon]
88

9-
2024/07/05 11:35: I'm working on updating code here now. **Not fully ready yet** until Jul'15.
9+
2024/07/07 13:45: I'm working on updating code here now. **Not fully ready yet** until Jul'15.
1010

1111
Pre-trained weights for models are available in [Zenodo](https://zenodo.org/records/12632962) link. Check usage in [2. Evaluation](#2-evaluation) or [3. Visualization](#3-visualization).
1212

@@ -60,7 +60,19 @@ docker run -it --gpus all -v /dev/shm:/dev/shm -v /home/kin/data:/home/kin/data
6060

6161
Note: Preparing raw data and processing the train data only need to be run once per task. There is no need to re-run them unless you delete the processed data.
6262

63-
### Prepare raw data
63+
### Data Preparation
64+
65+
Check [dataprocess/README.md](dataprocess/README.md#argoverse-20) for downloading tips for the raw Argoverse 2 dataset.
66+
67+
If you only want a mini processed dataset to try the code quickly, we directly provide one scene inside `train` and `val`. It is already converted to `.h5` format and processed with the label data.
68+
<!-- You can download it from [Zenodo](https://zenodo.org/record/12632962) and extract it to the data folder. -->
69+
```bash
70+
# TODO: update the link later when the data is ready
71+
# wget https://zenodo.org/record/12632962/files/demo_data.zip
72+
unzip demo_data.zip -d /home/kin/data/av2
73+
```
74+
75+
#### Prepare raw data
6476

6577
Extract all data to a unified h5 format. [Runtime: normally ~10 mins to finish running the following commands on my desktop, ~45 mins on the cluster I used]
6678
```bash
@@ -69,7 +81,7 @@ python dataprocess/extract_av2.py --av2_type sensor --data_mode val --mask_dir /
6981
python dataprocess/extract_av2.py --av2_type sensor --data_mode test --mask_dir /home/kin/data/av2/3d_scene_flow
7082
```
7183

72-
### Process train data
84+
#### Process train data
7385

7486
Process the train data for self-supervised learning. Only the training split needs this step. [Runtime: normally ~15 hours on my desktop, ~3 hours on the cluster with five nodes running in parallel.]
7587

@@ -85,6 +97,13 @@ Train SeFlow needed to specify the loss function, we set the config of our best
8597
python 1_train.py model=deflow lr=2e-4 epochs=20 batch_size=16 loss_fn=seflowLoss "add_seloss={chamfer_dis: 1.0, static_flow_loss: 1.0, dynamic_chamfer_dis: 1.0, cluster_based_pc0pc1: 1.0}" "model.target.num_iters=2" "model.val_monitor=val/Dynamic/Mean"
8698
```
8799

100+
### Other Benchmark Models
101+
102+
```bash
103+
python 1_train.py model=fastflow3d lr=2e-4 epochs=20 batch_size=16 loss_fn=deflowLoss
104+
python 1_train.py model=deflow lr=2e-4 epochs=20 batch_size=16 loss_fn=ff3dLoss
105+
```
106+
88107
## 2. Evaluation
89108

90109
You can view Wandb dashboard for the training and evaluation results or upload result to online leaderboard.
@@ -95,8 +114,9 @@ Since in training, we save all hyper-parameters and model checkpoints, the only
95114
# downloaded pre-trained weight, or train by yourself
96115
wget https://zenodo.org/records/12632962/files/seflow_official.ckpt
97116

117+
# it will directly print all metrics
118+
python 2_eval.py checkpoint=/home/kin/seflow_official.ckpt av2_mode=val
98119

99-
python 2_eval.py checkpoint=/home/kin/seflow_official.ckpt av2_mode=val # it will directly prints all metric
100120
# it will output the av2_submit.zip or av2_submit_v2.zip for you to submit to leaderboard
101121
python 2_eval.py checkpoint=/home/kin/seflow_official.ckpt av2_mode=test leaderboard_version=1
102122
python 2_eval.py checkpoint=/home/kin/seflow_official.ckpt av2_mode=test leaderboard_version=2

assets/cuda/mmcv/setup.py

Lines changed: 10 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -11,14 +11,21 @@
1111
name='mmcv._ext',
1212
sources=[
1313
"/".join(__file__.split("/")[:-1] + ["scatter_points_cuda.cu"]),
14+
"/".join(__file__.split("/")[:-1] + ["scatter_points.cpp"]),
1415
"/".join(__file__.split("/")[:-1] + ["voxelization_cuda.cu"]),
1516
"/".join(__file__.split("/")[:-1] + ["voxelization.cpp"]),
16-
"/".join(__file__.split("/")[:-1] + ["scatter_points.cpp"]),
1717
"/".join(__file__.split("/")[:-1] + ["cudabind.cpp"]),
1818
"/".join(__file__.split("/")[:-1] + ["pybind.cpp"]),
1919

20-
]),
21-
# extra_compile_args={'cxx': ['-g'], 'nvcc': ['-O2']}
20+
],
21+
# extra_compile_args={
22+
# 'cxx': ['-std=c++17'],
23+
# 'nvcc': ['-std=c++17',
24+
# '-D__CUDA_NO_HALF_OPERATORS__',
25+
# '-D__CUDA_NO_HALF_CONVERSIONS__',
26+
# '-D__CUDA_NO_HALF2_OPERATORS__',
27+
# ],}
28+
),
2229
],
2330
cmdclass={'build_ext': BuildExtension},
2431

assets/slurm/1_train.sh

Lines changed: 17 additions & 58 deletions
Original file line numberDiff line numberDiff line change
@@ -1,15 +1,15 @@
11
#!/bin/bash
2-
#SBATCH -J deflow
3-
#SBATCH --gpus 8 -C "thin"
2+
#SBATCH -J seflow
3+
#SBATCH --gpus 4 -C "fat"
44
#SBATCH -t 3-00:00:00
55
#SBATCH --mail-type=END,FAIL
66
#SBATCH --mail-user=qingwen@kth.se
7-
#SBATCH --output /proj/berzelius-2023-154/users/x_qinzh/deflow/logs/slurm/%J_deflow.out
8-
#SBATCH --error /proj/berzelius-2023-154/users/x_qinzh/deflow/logs/slurm/%J_deflow.err
7+
#SBATCH --output /proj/berzelius-2023-154/users/x_qinzh/seflow/logs/slurm/%J_seflow.out
8+
#SBATCH --error /proj/berzelius-2023-154/users/x_qinzh/seflow/logs/slurm/%J_seflow.err
99

10-
cd /proj/berzelius-2023-154/users/x_qinzh/deflow
10+
cd /proj/berzelius-2023-154/users/x_qinzh/seflow
1111

12-
SOURCE="/proj/berzelius-2023-154/users/x_qinzh/av2/deflow_preprocess"
12+
SOURCE="/proj/berzelius-2023-154/users/x_qinzh/data/av2/seflow_preprocess"
1313
DEST="/scratch/local/av2"
1414
SUBDIRS=("sensor/train" "sensor/val")
1515

@@ -24,55 +24,14 @@ elapsed=$((end_time - start_time))
2424
echo "Copy ${SOURCE} to ${DEST} Total time: ${elapsed} seconds"
2525
echo "Start training..."
2626

27-
# ====> leaderboard model = [fastflow3d, deflow]
28-
# /proj/berzelius-2023-154/users/x_qinzh/mambaforge/envs/deflow/bin/python 1_train.py \
29-
# slurm_id=$SLURM_JOB_ID wandb_mode=online dataset_path=/scratch/local/av2/sensor \
30-
# num_workers=16 model=deflow lr=2e-6 epochs=50 batch_size=10 loss_fn=deflowLoss
31-
32-
# /proj/berzelius-2023-154/users/x_qinzh/mambaforge/envs/deflow/bin/python 1_train.py \
33-
# slurm_id=$SLURM_JOB_ID wandb_mode=online dataset_path=/scratch/local/av2/sensor \
34-
# num_workers=16 model=fastflow3d lr=2e-6 epochs=50 batch_size=16 loss_fn=ff3dLoss
35-
36-
37-
38-
39-
# ===> ablation A: iteration num [2, 4 (R), 8, 16]
40-
# /proj/berzelius-2023-154/users/x_qinzh/mambaforge/envs/deflow/bin/python 1_train.py \
41-
# slurm_id=$SLURM_JOB_ID wandb_mode=online dataset_path=/scratch/local/av2/sensor \
42-
# num_workers=16 model=deflow lr=2e-6 epochs=50 batch_size=10 loss_fn=deflowLoss "model.target.num_iters=2"
43-
44-
# /proj/berzelius-2023-154/users/x_qinzh/mambaforge/envs/deflow/bin/python 1_train.py \
45-
# slurm_id=$SLURM_JOB_ID wandb_mode=online dataset_path=/scratch/local/av2/sensor \
46-
# num_workers=16 model=deflow lr=2e-6 epochs=50 batch_size=8 loss_fn=deflowLoss "model.target.num_iters=8"
47-
48-
# /proj/berzelius-2023-154/users/x_qinzh/mambaforge/envs/deflow/bin/python 1_train.py \
49-
# slurm_id=$SLURM_JOB_ID wandb_mode=online dataset_path=/scratch/local/av2/sensor \
50-
# num_workers=16 model=deflow lr=2e-6 epochs=50 batch_size=10 loss_fn=deflowLoss "model.target.num_iters=16"
51-
52-
53-
# ===> ablation B: loss_fn --- loss_fn = [ff3dLoss (R), zeroflowLoss, deflowLoss]
54-
# /proj/berzelius-2023-154/users/x_qinzh/mambaforge/envs/deflow/bin/python 1_train.py \
55-
# slurm_id=$SLURM_JOB_ID wandb_mode=online dataset_path=/scratch/local/av2/sensor \
56-
# num_workers=16 model=fastflow3d lr=2e-6 epochs=50 batch_size=16 loss_fn=zeroflowLoss
57-
58-
# /proj/berzelius-2023-154/users/x_qinzh/mambaforge/envs/deflow/bin/python 1_train.py \
59-
# slurm_id=$SLURM_JOB_ID wandb_mode=online dataset_path=/scratch/local/av2/sensor \
60-
# num_workers=16 model=fastflow3d lr=2e-6 epochs=50 batch_size=16 loss_fn=deflowLoss
61-
62-
63-
# ===> ablation C: decoder --- model.target.decoder_option = [linear, gru] and fastflow3d resolution [0.1, 0.2 (R), 0.4]
64-
# /proj/berzelius-2023-154/users/x_qinzh/mambaforge/envs/deflow/bin/python 1_train.py \
65-
# slurm_id=$SLURM_JOB_ID wandb_mode=online dataset_path=/scratch/local/av2/sensor \
66-
# num_workers=16 model=deflow lr=2e-6 epochs=50 batch_size=10 loss_fn=ff3dLoss "model.target.decoder_option=linear"
67-
68-
# /proj/berzelius-2023-154/users/x_qinzh/mambaforge/envs/deflow/bin/python 1_train.py \
69-
# slurm_id=$SLURM_JOB_ID wandb_mode=online dataset_path=/scratch/local/av2/sensor \
70-
# num_workers=16 model=deflow lr=2e-6 epochs=50 batch_size=10 loss_fn=ff3dLoss "model.target.decoder_option=gru"
71-
72-
# /proj/berzelius-2023-154/users/x_qinzh/mambaforge/envs/deflow/bin/python 1_train.py \
73-
# slurm_id=$SLURM_JOB_ID wandb_mode=online dataset_path=/scratch/local/av2/sensor \
74-
# num_workers=16 model=fastflow3d lr=2e-6 epochs=50 batch_size=10 loss_fn=ff3dLoss "voxel_size=[0.1, 0.1, 6]"
75-
76-
# /proj/berzelius-2023-154/users/x_qinzh/mambaforge/envs/deflow/bin/python 1_train.py \
77-
# slurm_id=$SLURM_JOB_ID wandb_mode=online dataset_path=/scratch/local/av2/sensor \
78-
# num_workers=16 model=fastflow3d lr=2e-6 epochs=50 batch_size=16 loss_fn=ff3dLoss "voxel_size=[0.4, 0.4, 6]"
27+
# ====> paper model = seflow_official
28+
# /proj/berzelius-2023-154/users/x_qinzh/mambaforge/envs/seflow/bin/python 1_train.py \
29+
# slurm_id=$SLURM_JOB_ID wandb_mode=online train_data=/scratch/local/av2/sensor/train val_data=/scratch/local/av2/sensor/val \
30+
# num_workers=16 model=deflow lr=2e-6 epochs=50 batch_size=20 "model.target.num_iters=2" "model.val_monitor=val/Dynamic/Mean" \
31+
# loss_fn=seflowLoss "add_seloss={chamfer_dis: 1.0, static_flow_loss: 1.0, dynamic_chamfer_dis: 1.0, cluster_based_pc0pc1: 1.0}"
32+
33+
# ====> leaderboard model = seflow_best
34+
/proj/berzelius-2023-154/users/x_qinzh/mambaforge/envs/seflow/bin/python 1_train.py \
35+
slurm_id=$SLURM_JOB_ID wandb_mode=online train_data=/scratch/local/av2/sensor/train val_data=/scratch/local/av2/sensor/val \
36+
num_workers=16 model=deflow lr=2e-4 epochs=20 batch_size=16 "model.target.num_iters=2" "model.val_monitor=val/Dynamic/Mean" \
37+
loss_fn=seflowLoss "add_seloss={chamfer_dis: 1.0, static_flow_loss: 1.0, dynamic_chamfer_dis: 1.0, cluster_based_pc0pc1: 1.0}"

conf/config.yaml

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -27,9 +27,8 @@ gradient_clip_val: 5.0
2727

2828
# optimizer ==> Adam
2929
lr: 2e-6
30-
loss_fn: deflowLoss # choices: [ff3dLoss, zeroflowLoss, deflowLoss, seflowLoss]
31-
add_seloss: {chamfer_dis: 1.0, static_flow_loss: 1.0, dynamic_chamfer_dis: 1.0, cluster_based_pc0pc1: 1.0}
32-
label_name: label # choices: [label, nnd_label_plus]
30+
loss_fn: seflowLoss # choices: [ff3dLoss, zeroflowLoss, deflowLoss, seflowLoss]
31+
add_seloss:
3332

3433
# log settings
3534
seed: 42069

conf/hydra/default.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,2 +1,2 @@
11
run:
2-
dir: logs/wandb/${output}
2+
dir: logs/wandb

dataprocess/extract_av2.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -67,6 +67,8 @@ def create_eval_mask(data_mode: str, output_dir_: Path, mask_dir: str):
6767
timestamps = sorted([int(file.replace('.feather', ''))
6868
for file in os.listdir(Path(mask_dir) / f"{data_mode}-masks" / scene_id)
6969
if file.endswith('.feather')])
70+
if not os.path.exists(output_dir_ / f'{scene_id}.h5'):
71+
continue
7072
with h5py.File(output_dir_ / f'{scene_id}.h5', 'r+') as f:
7173
for ts in timestamps:
7274
key = str(ts)

scripts/network/dataloader.py

Lines changed: 19 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -40,10 +40,20 @@ def collate_fn_pad(batch):
4040

4141
if 'ego_motion' in batch[0]:
4242
res_dict['ego_motion'] = [batch[i]['ego_motion'] for i in range(len(batch))]
43+
44+
if 'pc0_dynamic' in batch[0]:
45+
pc0_dynamic_after_mask_ground, pc1_dynamic_after_mask_ground= [], []
46+
for i in range(len(batch)):
47+
pc0_dynamic_after_mask_ground.append(batch[i]['pc0_dynamic'][~batch[i]['gm0']])
48+
pc1_dynamic_after_mask_ground.append(batch[i]['pc1_dynamic'][~batch[i]['gm1']])
49+
pc0_dynamic_after_mask_ground = torch.nn.utils.rnn.pad_sequence(pc0_dynamic_after_mask_ground, batch_first=True, padding_value=0)
50+
pc1_dynamic_after_mask_ground = torch.nn.utils.rnn.pad_sequence(pc1_dynamic_after_mask_ground, batch_first=True, padding_value=0)
51+
res_dict['pc0_dynamic'] = pc0_dynamic_after_mask_ground
52+
res_dict['pc1_dynamic'] = pc1_dynamic_after_mask_ground
4353

4454
return res_dict
4555
class HDF5Dataset(Dataset):
46-
def __init__(self, directory, eval = False, leaderboard_version=1):
56+
def __init__(self, directory, dufo=False, eval = False, leaderboard_version=1):
4757
'''
4858
directory: the directory of the dataset
4959
eval: if True, use the eval index
@@ -55,6 +65,7 @@ def __init__(self, directory, eval = False, leaderboard_version=1):
5565
self.data_index = pickle.load(f)
5666

5767
self.eval_index = False
68+
self.dufo = dufo
5869
if eval:
5970
index_file_name = 'index_eval.pkl'
6071
if leaderboard_version == 2:
@@ -106,12 +117,12 @@ def __getitem__(self, index_):
106117

107118
key = str(timestamp)
108119
with h5py.File(os.path.join(self.directory, f'{scene_id}.h5'), 'r') as f:
109-
pc0 = torch.tensor(f[key]['lidar'][:])
120+
pc0 = torch.tensor(f[key]['lidar'][:][:,:3])
110121
gm0 = torch.tensor(f[key]['ground_mask'][:])
111122
pose0 = torch.tensor(f[key]['pose'][:])
112123

113124
next_timestamp = str(self.data_index[index_+1][1])
114-
pc1 = torch.tensor(f[next_timestamp]['lidar'][:])
125+
pc1 = torch.tensor(f[next_timestamp]['lidar'][:][:,:3])
115126
gm1 = torch.tensor(f[next_timestamp]['ground_mask'][:])
116127
pose1 = torch.tensor(f[next_timestamp]['pose'][:])
117128
# if pc0[~gm0].shape[0] == 0:
@@ -143,10 +154,15 @@ def __getitem__(self, index_):
143154
ego_motion = torch.tensor(f[key]['ego_motion'][:])
144155
res_dict['ego_motion'] = ego_motion
145156

157+
if self.dufo:
158+
res_dict['pc0_dynamic'] = torch.tensor(f[key]['label'][:].astype('int16'))
159+
res_dict['pc1_dynamic'] = torch.tensor(f[next_timestamp]['label'][:].astype('int16'))
160+
146161
if self.eval_index:
147162
# looks like v2 not follow the same rule as v1 with eval_mask provided
148163
eval_mask = torch.tensor(f[key]['eval_mask'][:]) if 'eval_mask' in f[key] else torch.ones_like(pc0[:, 0], dtype=torch.bool)
149164
res_dict['eval_mask'] = eval_mask
165+
150166
return res_dict
151167

152168
if __name__ == "__main__":

0 commit comments

Comments
 (0)