updates

BinaryOracle · BinaryOracle · commit 8850a210a50b · 2025-06-03T17:42:20.000+08:00
diff --git a/src/3DVL/Grounding_3D_Object_Affordance.md b/src/3DVL/Grounding_3D_Object_Affordance.md
@@ -436,7 +436,52 @@ class Cross_Attention(nn.Module):
 
         return I_1, I_2
 ```
+### Step 10: 使用分割头预测最终的 3D 可操作性热图
 
+```python
+class Head(nn.Module):
+    def __init__(self, additional_channel, emb_dim, N_p, N_raw):
+        super().__init__()
+        
+        self.emb_dim = emb_dim
+        self.N_p = N_p
+        self.N_raw = N_raw
+        #upsample
+        self.fp3 = PointNetFeaturePropagation(in_channel=512+self.emb_dim, mlp=[768, 512])  
+        self.fp2 = PointNetFeaturePropagation(in_channel=832, mlp=[768, 512]) 
+        self.fp1 = PointNetFeaturePropagation(in_channel=518+additional_channel, mlp=[512, 512]) 
+        self.pool = nn.AdaptiveAvgPool1d(1)
+
+        self.out_head = nn.Sequential(
+            nn.Linear(self.emb_dim, self.emb_dim // 8),
+            nn.BatchNorm1d(self.N_raw),
+            nn.ReLU(),
+            nn.Linear(self.emb_dim // 8, 1),
+            nn.Sigmoid()
+        )
+
+    def forward(self, multi_feature, affordance_feature, encoder_p):
+        '''
+        multi_feature ---> [B, N_p + N_i, C]
+        affordance_feature ---> [B, N_p + N_i, C]
+        encoder_p ---> [Hierarchy feature]
+        '''
+        B,N,C = multi_feature.size()
+        p_0, p_1, p_2, p_3 = encoder_p
+        P_align, _ = torch.split(multi_feature, split_size_or_sections=self.N_p, dim=1)         #[B, N_p, C] --- [B, N_i, C]
+        F_pa, _ = torch.split(affordance_feature, split_size_or_sections = self.N_p, dim=1)     #[B, N_p, C] --- [B, N_i, C]
+
+        up_sample = self.fp3(p_2[0], p_3[0], p_2[1], P_align.mT)                                #[B, emb_dim, npoint_sa2]
+        up_sample = self.fp2(p_1[0], p_2[0], p_1[1], up_sample)                                 #[B, emb_dim, npoint_sa1]                        
+        up_sample = self.fp1(p_0[0], p_1[0], torch.cat([p_0[0], p_0[1]],1), up_sample)          #[B, emb_dim, N_raw]
+        F_pa_pool = self.pool(F_pa.mT)                                                          #[B, emb_dim, 1]
+        
+        affordance = up_sample * F_pa_pool.expand(-1,-1,self.N_raw)                             #[B, emb_dim, 2048]
+        
+        out = self.out_head(affordance.mT)                                                      #[B, 2048, 1]
+
+        return out
+```
 
 
 
diff --git a/src/3DVL/简析PointNet++.md b/src/3DVL/简析PointNet++.md
@@ -551,3 +551,237 @@ MRG通过结合来自不同分辨率的特征来实现效率和适应性的平
 > 来自下一级的特征：首先，将来自下一级（更高分辨率）的特征进行汇总，形成一个特征向量。这一过程通过对每个子区域应用集合抽象层（set abstraction level）完成。
 
 > 直接处理的原始点特征：另一部分特征是通过在当前分辨率直接对所有原始点应用单个PointNet得到的。
+
+## 点云语义分割
+
+**PointNet++ 完成点云分割任务的过程是一个典型的“编码-解码”结构**，结合了层级特征提取和多尺度融合机制。
+
+**目标:** 给定一个点云，模型需要为每个点预测一个类别标签（如桌子、椅子、墙壁等）。
+
+- 输入：`xyz: [B, N, 3]`  
+
+- 输出：`scores: [B, N, C]`（每个点在各类别上的得分，对最后一维取 argmax 即得到该点的类别标签），其中 `C` 是类别数
+
+
+PointNet++ 分割的整体结构 :
+
+```text
+Input Points (xyz): [ B, N, 3 ]
+        ↓
+Set Abstraction Layers（编码器）
+        ↓
+Feature Vectors at Multiple Scales
+        ↓
+Feature Propagation Layers（解码器）
+        ↓
+Recovered Features at Original Resolution
+        ↓
+MLP + Softmax → Per-point Semantic Labels
+```
+---
+
+**第一步：Set Abstraction（集合抽象）—— 编码器:** 对点云进行下采样，并在每个局部区域提取特征。
+
+核心操作包括：
+
+1. **FPS（Farthest Point Sampling）**：从点云中选出有代表性的点作为中心点。
+
+2. **Ball Query**：为每个中心点找到其邻域内的点。
+
+3. **Grouping**：将邻域点组合成局部点云组。
+
+4. **PointNet 操作**：先将邻域点坐标转换为相对于中心点的局部坐标，再通过共享 MLP 逐点提取特征（局部特征提取不使用 T-Net）。
+
+5. **Pooling**：对局部点云组做最大池化或平均池化，得到该区域的特征。
+
+> 多个 Set Abstraction 层堆叠，逐步减少点的数量，增加特征维度，形成多尺度特征表示。
+
+---
+
+**第二步：Feature Propagation（特征传播）—— 解码器:** 从最稀疏的点开始，逐层将特征插值回原始点数量。
+
+特征插值方式：
+
+- 使用 **反距离加权插值（IDW）**，即根据最近的几个邻近点的距离进行加权平均。
+
+- 可选地拼接 skip connection 中的特征（来自编码器中点数相同的对应 Set Abstraction 层的输出）。
+
+输入输出示例：
+
+```python
+def forward(xyz1, xyz2, points1, points2):
+    # xyz1: coordinates of the original points (the larger set)
+    # xyz2: coordinates of the downsampled points (the smaller set)
+    # points1: features of the original points (may be absent)
+    # points2: features of the downsampled points
+    return interpolated_points  # dense interpolated features, one feature vector per point in xyz1
+```
+
+> 多个 Feature Propagation 层堆叠，逐渐恢复点数，最终回到原始点数量。
+
+---
+
+**第三步：Head 预测头 —— 分类每个点:** 对每个点的特征做一个简单的分类器，输出类别概率。
+
+实现方式：
+
+- 将最后一层 Feature Propagation 输出的特征送入一个小型 MLP。
+
+- 最后一层使用 `Softmax`（对于多分类）或 `Sigmoid`（对于多标签）激活函数。
+
+例如：
+
+```python
+mlp = nn.Sequential(
+    nn.Conv1d(128, 128, 1),  # kernel size 1: the same weights are applied to every point
+    nn.BatchNorm1d(128),
+    nn.ReLU(),
+    nn.Dropout(0.5),
+    nn.Conv1d(128, num_classes, 1)  # one output channel per class
+)
+logits = mlp(final_features)  # shape: [B, num_classes, N]; final_features must be [B, 128, N]
+```
+
+### 代码实现
+
+PointNet++ 的整体结构是一个典型的 编码器-解码器（Encoder-Decoder）架构 ：
+
+- Set Abstraction 层 ：不断对点云进行下采样 + 提取局部特征（编码过程）
+
+- Feature Propagation 层 ：从最稀疏的点开始，逐层恢复到原始点数（解码过程）
+
+```
+[Input Points] 
+      ↓
+SA Layer 1 → [Points: 1024 → 512]
+      ↓
+SA Layer 2 → [Points: 512 → 128]
+      ↓
+SA Layer 3 → [Points: 128 → 32]
+      ↓
+FP Layer 3 ← [Points: 32 → 128]
+      ↓
+FP Layer 2 ← [Points: 128 → 512]
+      ↓
+FP Layer 1 ← [Points: 512 → 1024]
+      ↓
+[Per-point Classification Head]
+      ↓
+[Output: per-point labels]
+```
+
+#### 特征传播层
+
+PointNetFeaturePropagation 是 PointNet++ 中用于点云“特征传播”（Feature Propagation）的核心模块，主要作用是：
+
+- 将稀疏点集的特征插值回原始点集的位置上。
+
+换句话说：
+
+- 输入：少量点的坐标 + 特征（如经过下采样后的点）
+
+- 输出：在原始点数量下的每个点都拥有一个合理的特征向量
+
+这一步相当于图像任务中的 上采样（upsample）或转置卷积（transpose convolution） ，但在点云这种非结构化数据中，不能直接使用这些操作。
+
+```python
+class PointNetFeaturePropagation(nn.Module):
+    def __init__(self, in_channel, mlp):
+        """
+        初始化函数，构建用于特征传播（上采样）的MLP层
+        
+        参数：
+            in_channel: 输入特征的通道数（维度）
+            mlp: 一个列表，表示每一层MLP的输出通道数，例如 [64, 128]
+        """
+        super(PointNetFeaturePropagation, self).__init__()
+        
+        # 用于保存卷积层和批归一化层
+        self.mlp_convs = nn.ModuleList()
+        self.mlp_bns = nn.ModuleList()
+        
+        last_channel = in_channel  # 当前输入通道数初始化为in_channel
+
+        # 构建MLP层：每个层是一个Conv1d + BatchNorm1d + ReLU
+        for out_channel in mlp:
+            self.mlp_convs.append(nn.Conv1d(last_channel, out_channel, 1))
+            self.mlp_bns.append(nn.BatchNorm1d(out_channel))
+            last_channel = out_channel  # 更新下一层的输入通道数
+
+    def forward(self, xyz1, xyz2, points1, points2):
+        """
+        前向传播函数：将稀疏点集points2插值到密集点集xyz1的位置上
+
+        参数：
+            xyz1: 原始点坐标数据，形状 [B, C, N] （如 1024 个点）
+            xyz2: 下采样后的点坐标数据，形状 [B, C, S] （如 256 个点）
+            points1: 原始点对应的特征数据，形状 [B, D, N] （可为 None）
+            points2: 下采样点对应的特征数据，形状 [B, D, S]
+
+        返回：
+            new_points: 插值并融合后的特征，形状 [B, D', N]
+        """
+        # 将坐标和特征从 [B, C, N] 转换为 [B, N, C] 格式，便于后续计算
+        xyz1 = xyz1.permute(0, 2, 1)     # [B, N, C]
+        xyz2 = xyz2.permute(0, 2, 1)     # [B, S, C]
+        points2 = points2.permute(0, 2, 1)  # [B, S, D]
+
+        B, N, C = xyz1.shape             # 原始点数量 N
+        _, S, _ = xyz2.shape             # 下采样点数量 S
+
+        # 如果只有1个下采样点，直接复制其特征到所有原始点
+        if S == 1:
+            interpolated_points = points2.repeat(1, N, 1)  # [B, N, D]
+
+        else:
+            # 计算原始点与下采样点之间的距离矩阵（欧氏距离平方）
+            dists = square_distance(xyz1, xyz2)  # [B, N, S]
+
+            # 对每个原始点，找到最近的3个邻近点
+            dists, idx = dists.sort(dim=-1)
+            dists = dists[:, :, :3]   # 取最小的三个距离 [B, N, 3]
+            idx = idx[:, :, :3]       # 取对应的索引 [B, N, 3]
+
+            # 使用反距离加权（IDW）计算权重:
+            # 1.将距离转换为“权重”，距离越近，权重越大
+            dist_recip = 1.0 / (dists + 1e-8)  # 避免除以零
+            # 2.对每个点的3个权重求和，得到归一化因子
+            norm = torch.sum(dist_recip, dim=2, keepdim=True)  # 归一化因子
+            # 3.归一化权重，使得每个点的权重之和为1
+            weight = dist_recip / norm  # 加权平均系数 [B, N, 3]
+
+            # 为每个原始点，找到它最近的 3 个邻近点，根据距离分配权重，然后对它们的特征做加权平均，从而插值得到该点的特征。 
+            # index_points: [B, S, D] -> [B, N, 3, D]
+            # weight.view(B, N, 3, 1): 扩展维度后相乘
+            interpolated_points = torch.sum(
+                # 1. 从下采样点中取出每个原始点对应的最近邻点的特征。
+                #    points2: [B, S, D] —— 下采样点的特征（S 个点，每个点有 D 维特征）
+                #    idx: [B, N, 3] —— 每个原始点对应的 3 个最近邻点索引
+                #    [B, N, 3, D] —— 每个原始点都有了它最近的 3 个邻近点的特征
+                index_points(points2, idx) 
+                # 将之前计算好的权重扩展维度，以便和特征相乘。
+                # weight: [B, N, 3] —— 每个点的三个邻近点的权重
+                # [B, N, 3, 1] —— 扩展后便于广播乘法
+                * weight.view(B, N, 3, 1),
+                dim=2
+            )  # [B, N, D]
+
+        # 如果原始点有特征，则拼接起来（skip connection）
+        if points1 is not None:
+            points1 = points1.permute(0, 2, 1)  # [B, N, D]
+            new_points = torch.cat([points1, interpolated_points], dim=-1)  # [B, N, D1+D2]
+        else:
+            new_points = interpolated_points  # [B, N, D]
+
+        # 恢复张量格式为 [B, D, N]，以适配后面的卷积操作
+        new_points = new_points.permute(0, 2, 1)  # [B, D', N]
+
+        # 经过MLP进一步提取和融合特征
+        for i, conv in enumerate(self.mlp_convs):
+            bn = self.mlp_bns[i]
+            new_points = F.relu(bn(conv(new_points)))  # Conv1d + BN + ReLU
+
+        return new_points  # 最终输出特征 [B, D', N]
+```
+
+