ImageClassification/auto_classifier.py at master · cgx-avenue/ImageClassification · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
""" 自动图片相似度聚类分类器
# 默认最小聚类大小为3张
python auto_classifier.py /path/to/images

# 设置最小聚类大小为5张
python auto_classifier.py /path/to/images --min-cluster-size 5

# 组合参数使用
python auto_classifier.py /path/to/images --threshold 0.8 --min-cluster-size 4

"""
import cv2
import numpy as np
import os
import shutil
from pathlib import Path
from tqdm import tqdm
from typing import List, Tuple, Dict, Set
from datetime import datetime
import json
# Optional HEIC/HEIF support (needs Pillow plus the pillow-heif plugin).
try:
    from PIL import Image
    import pillow_heif
    # Register the HEIF opener with PIL.
    pillow_heif.register_heif_opener()
    HEIC_SUPPORT = True
except ImportError:
    # One of the optional packages is missing: record that HEIC is unavailable
    # and warn the user once at import time.
    HEIC_SUPPORT = False
    print("警告: 未安装pillow-heif，将不支持HEIC格式")


class ImageClusterClassifier:
    """
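    Automatic image-similarity clustering classifier.

    Scans a folder recursively for images, extracts simple visual features,
    groups similar images with a greedy threshold-based clustering, moves each
    group into its own sub-folder and writes a plain-text report.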
    """

    def __init__(self, similarity_threshold: float = 0.75, min_cluster_size: int = 3):
        """
        Set up the classifier configuration and its empty working state.

        Args:
            similarity_threshold: Minimum combined similarity score for two
                images to be put in the same group. Defaults to 0.75.
            min_cluster_size: Smallest group size that is kept as a group of
                its own. Defaults to 3 images.
        """
        # Tunable parameters
        self.similarity_threshold = similarity_threshold
        self.min_cluster_size = min_cluster_size

        # Working state
        self.image_features = dict()   # image path -> feature dict
        self.clusters = list()         # clustering results
        self.processed_images = set()  # images already handled

        # File extensions (lower-case, with leading dot) treated as images
        base_extensions = (
            '.jpg', '.jpeg', '.png', '.bmp', '.tiff', '.tif', '.webp',
            '.gif', '.svg', '.ico', '.psd', '.raw', '.cr2', '.nef', '.arw',
        )
        # HEIC/HEIF are only accepted when the optional decoder was imported
        heic_extensions = ('.heic', '.heif') if HEIC_SUPPORT else ()
        self.supported_formats = set(base_extensions) | set(heic_extensions)

    def load_image(self, image_path: str) -> np.ndarray:
        """
        Load an image as a BGR array, including HEIC/HEIF when supported.

        Args:
            image_path: Path of the image file to read.

        Returns:
            The decoded image in OpenCV's BGR channel order, or ``None`` when
            the file cannot be read or decoded.
        """
        file_ext = Path(image_path).suffix.lower()

        try:
            # HEIC/HEIF: decode through PIL, then convert to OpenCV's layout.
            if file_ext in {'.heic', '.heif'} and HEIC_SUPPORT:
                # The context manager closes the underlying file handle even
                # if the conversion fails.
                with Image.open(image_path) as pil_image:
                    rgb_image = pil_image if pil_image.mode == 'RGB' else pil_image.convert('RGB')
                    # Materialise the pixels before the file is closed.
                    img_array = np.array(rgb_image)
                # RGB -> BGR for OpenCV
                return cv2.cvtColor(img_array, cv2.COLOR_RGB2BGR)

            # Every other format: let OpenCV read the file directly.
            img = cv2.imread(image_path)
            if img is None and os.path.isfile(image_path):
                # cv2.imread cannot open paths containing non-ASCII characters
                # on some platforms (notably Windows). Read the bytes with
                # NumPy and decode them in memory as a fallback.
                raw_bytes = np.fromfile(image_path, dtype=np.uint8)
                if raw_bytes.size:
                    img = cv2.imdecode(raw_bytes, cv2.IMREAD_COLOR)
            return img
        except Exception as e:
            print(f"加载图片失败 {image_path}: {str(e)}")
            return None

    def get_all_image_files(self, folder_path: str) -> List[str]:
        """
        Collect every supported image below ``folder_path``, recursively.

        Args:
            folder_path: Root directory to scan.

        Returns:
            Sorted list of image paths (as strings) whose lower-cased
            extension is in ``self.supported_formats``.
        """
        root = Path(folder_path)
        print(f"正在扫描目录: {root}")

        # Walk the whole tree once; the list is needed for the count below.
        candidates = list(root.rglob('*'))
        print(f"找到 {len(candidates)} 个文件，正在筛选图片...")

        # Keep regular files with a supported extension, sorted by path.
        image_files = sorted(
            str(entry)
            for entry in tqdm(candidates, desc="扫描文件")
            if entry.is_file() and entry.suffix.lower() in self.supported_formats
        )
        print(f"找到 {len(image_files)} 张图片文件")

        # Per-extension tally, printed in alphabetical order.
        counts = {}
        for path_text in image_files:
            suffix = Path(path_text).suffix.lower()
            counts[suffix] = counts.get(suffix, 0) + 1

        print("图片格式统计:")
        for suffix in sorted(counts):
            print(f"  {suffix}: {counts[suffix]} 张")

        return image_files

    def calculate_image_features(self, image_path: str) -> Dict:
        """
        Extract the feature set used for similarity comparison.

        Args:
            image_path: Path of the image to analyse.

        Returns:
            A dict with the keys ``histogram``, ``dimensions``,
            ``aspect_ratio``, ``mean_color``, ``edge_density``, ``file_size``,
            ``file_path`` and ``file_format``; or ``None`` when the image
            cannot be loaded or feature extraction fails.
        """
        image = self.load_image(image_path)
        if image is None:
            return None

        try:
            # 3-D HSV colour histogram, normalised in place
            hsv_image = cv2.cvtColor(image, cv2.COLOR_BGR2HSV)
            histogram = cv2.calcHist([hsv_image], [0, 1, 2], None, [50, 60, 60], [0, 180, 0, 256, 0, 256])
            cv2.normalize(histogram, histogram)

            # Pixel dimensions (shape is rows x columns)
            rows, cols = image.shape[:2]

            # Average colour over all pixels
            average_color = np.mean(image.reshape(-1, 3), axis=0)

            # Share of pixels that Canny marks as edges
            grayscale = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
            edge_map = cv2.Canny(grayscale, 50, 150)
            edge_share = np.sum(edge_map > 0) / (cols * rows)

            return {
                'histogram': histogram,
                'dimensions': (cols, rows),
                'aspect_ratio': cols / rows,
                'mean_color': average_color,
                'edge_density': edge_share,
                # File-level metadata
                'file_size': os.path.getsize(image_path),
                'file_path': image_path,
                'file_format': Path(image_path).suffix.lower(),
            }

        except Exception as exc:
            print(f"提取特征失败 {image_path}: {str(exc)}")
            return None

    def calculate_similarity_score(self, features1: Dict, features2: Dict) -> float:
        """
        计算两张图片的综合相似度得分
        """
        if not features1 or not features2:
            return 0.0

        scores = []
        weights = []

        # 1. 直方图相似度 (权重: 0.4)
        if 'histogram' in features1 and 'histogram' in features2:
            hist_sim = cv2.compareHist(features1['histogram'], features2['histogram'], cv2.HISTCMP_CORREL)
            scores.append(max(0.0, hist_sim))
            weights.append(0.4)

        # 2. 尺寸相似度 (权重: 0.2)
        if 'dimensions' in features1 and 'dimensions' in features2:
            w1, h1 = features1['dimensions']
            w2, h2 = features2['dimensions']
            size_diff = abs(w1 * h1 - w2 * h2) / max(w1 * h1, w2 * h2)
            size_sim = 1.0 - min(1.0, size_diff)
            scores.append(size_sim)
            weights.append(0.2)

        # 3. 宽高比相似度 (权重: 0.15)
        if 'aspect_ratio' in features1 and 'aspect_ratio' in features2:
            ratio_diff = abs(features1['aspect_ratio'] - features2['aspect_ratio'])
            ratio_sim = 1.0 - min(1.0, ratio_diff)
            scores.append(ratio_sim)
            weights.append(0.15)

        # 4. 颜色相似度 (权重: 0.15)
        if 'mean_color' in features1 and 'mean_color' in features2:
            color_diff = np.linalg.norm(features1['mean_color'] - features2['mean_color'])
            color_sim = 1.0 - min(1.0, color_diff / 441.67)  # 441.67 = sqrt(3) * 255
            scores.append(color_sim)
            weights.append(0.15)

        # 5. 边缘密度相似度 (权重: 0.1)
        if 'edge_density' in features1 and 'edge_density' in features2:
            edge_diff = abs(features1['edge_density'] - features2['edge_density'])
            edge_sim = 1.0 - min(1.0, edge_diff * 10)
            scores.append(edge_sim)
            weights.append(0.1)

        # 加权平均
        if scores and weights:
            weighted_score = sum(s * w for s, w in zip(scores, weights)) / sum(weights)
            return weighted_score

        return 0.0

    def find_clusters(self, image_paths: List[str]) -> Tuple[List[List[str]], List[str]]:
        """
        Group similar images with a simple greedy clustering.

        Features are extracted for every path and cached in
        ``self.image_features``. Images are then visited in the order given:
        each image not yet assigned becomes the seed of a new group and pulls
        in every later unassigned image whose similarity to the seed is at
        least ``self.similarity_threshold``. Visiting in input order (instead
        of iterating a ``set`` of strings, whose order changes between
        interpreter runs) makes the grouping reproducible.

        Args:
            image_paths: Paths of the images to cluster.

        Returns:
            Tuple ``(valid_clusters, unclassified_images)``: groups with at
            least ``self.min_cluster_size`` members, largest first, and the
            flat list of images from all smaller groups.
        """
        print("正在计算图片特征...")

        # Extract features; images that fail are counted and left out.
        valid_images = []
        failed_count = 0

        for img_path in tqdm(image_paths, desc="提取特征"):
            features = self.calculate_image_features(img_path)
            if features is None:
                failed_count += 1
                continue
            self.image_features[img_path] = features
            valid_images.append(img_path)

        print(f"成功提取 {len(valid_images)} 张图片的特征")
        if failed_count > 0:
            print(f"失败 {failed_count} 张图片")

        # Greedy clustering in deterministic (input) order.
        all_clusters = []
        assigned = set()  # images that already belong to some group

        print("正在进行相似度聚类...")
        # The context manager closes the progress bar even on error.
        with tqdm(total=len(valid_images), desc="聚类进度") as pbar:
            for seed_index, seed_image in enumerate(valid_images):
                if seed_image in assigned:
                    continue

                # This image seeds a new group.
                assigned.add(seed_image)
                current_cluster = [seed_image]
                seed_features = self.image_features[seed_image]

                # Everything before the seed is already assigned, so only
                # later images can still join.
                for img_path in valid_images[seed_index + 1:]:
                    if img_path in assigned:
                        continue
                    similarity = self.calculate_similarity_score(
                        seed_features,
                        self.image_features[img_path]
                    )
                    if similarity >= self.similarity_threshold:
                        current_cluster.append(img_path)
                        assigned.add(img_path)

                pbar.update(len(current_cluster))
                all_clusters.append(current_cluster)

        # Split into groups that are large enough and leftover images.
        valid_clusters = [
            cluster for cluster in all_clusters
            if len(cluster) >= self.min_cluster_size
        ]
        unclassified_images = [
            img_path
            for cluster in all_clusters
            if len(cluster) < self.min_cluster_size
            for img_path in cluster
        ]

        # Largest groups first (stable sort keeps discovery order on ties).
        valid_clusters.sort(key=len, reverse=True)

        print("聚类完成:")
        print(f"  有效聚类组: {len(valid_clusters)} 个")
        print(f"  未分类图片: {len(unclassified_images)} 张")

        return valid_clusters, unclassified_images

    def create_output_folders(self, base_folder: str, valid_clusters: List[List[str]],
                             unclassified_images: List[str]) -> Dict:
        """
        创建输出文件夹结构
        """
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

        # 创建主分类文件夹
        classified_folder = os.path.join(base_folder, f"classified_{timestamp}")
        os.makedirs(classified_folder, exist_ok=True)

        folder_info = {
            'main_folder': classified_folder,
            'cluster_folders': [],
            'unclassified_folder': None
        }

        # 为每个有效聚类创建文件夹
        for i, cluster in enumerate(valid_clusters, 1):
            folder_name = f"similar_group_{i:03d}_{len(cluster)}pics"
            cluster_folder = os.path.join(classified_folder, folder_name)
            os.makedirs(cluster_folder, exist_ok=True)
            folder_info['cluster_folders'].append({
                'path': cluster_folder,
                'count': len(cluster),
                'images': cluster
            })

        # 为未分类图片创建文件夹
        if unclassified_images:
            unclassified_folder = os.path.join(classified_folder, f"notclassified_{len(unclassified_images)}pics")
            os.makedirs(unclassified_folder, exist_ok=True)
            folder_info['unclassified_folder'] = {
                'path': unclassified_folder,
                'count': len(unclassified_images),
                'images': unclassified_images
            }

        return folder_info

    def move_images_to_folders(self, valid_clusters: List[List[str]], unclassified_images: List[str],
                              folder_info: Dict) -> Dict:
        """
        Move every image into the folder prepared for its group.

        Name clashes in the target folder are resolved by appending
        ``_1``, ``_2``, ... to the file stem. A failed move is recorded in the
        log and does not stop the run.

        Args:
            valid_clusters: Image groups, in the same order as
                ``folder_info['cluster_folders']``.
            unclassified_images: Leftover images for the shared folder.
            folder_info: Folder description with ``cluster_folders`` and
                ``unclassified_folder`` entries.

        Returns:
            Dict with ``total_moved``, ``cluster_moves`` (per-folder counts),
            ``unclassified_moves`` and ``errors`` (list of messages).
        """
        move_log = {
            'total_moved': 0,
            'cluster_moves': [],
            'unclassified_moves': 0,
            'errors': []
        }

        def relocate(source_path: str, destination_dir: str) -> bool:
            """Move one file into destination_dir; return True on success."""
            try:
                filename = os.path.basename(source_path)
                stem, extension = os.path.splitext(filename)
                destination = os.path.join(destination_dir, filename)

                # Pick the first free "<stem>_<n><ext>" name on a clash.
                suffix_number = 1
                while os.path.exists(destination):
                    destination = os.path.join(destination_dir, f"{stem}_{suffix_number}{extension}")
                    suffix_number += 1

                shutil.move(source_path, destination)
            except Exception as exc:
                move_log['errors'].append(f"移动失败: {source_path} -> {str(exc)}")
                return False
            move_log['total_moved'] += 1
            return True

        print("正在移动图片到分类文件夹...")

        # Grouped images: folder i belongs to cluster i.
        for position, members in enumerate(tqdm(valid_clusters, desc="移动聚类图片")):
            target_folder = folder_info['cluster_folders'][position]['path']

            moved_count = 0
            for member_path in members:
                if relocate(member_path, target_folder):
                    moved_count += 1

            move_log['cluster_moves'].append({
                'folder': os.path.basename(target_folder),
                'moved_count': moved_count
            })

        # Leftover images, only if a folder was created for them.
        leftovers_info = folder_info['unclassified_folder']
        if leftovers_info and unclassified_images:
            leftovers_dir = leftovers_info['path']
            for leftover_path in tqdm(unclassified_images, desc="移动未分类图片"):
                if relocate(leftover_path, leftovers_dir):
                    move_log['unclassified_moves'] += 1

        return move_log

    def generate_report(self, base_folder: str, valid_clusters: List[List[str]],
                       unclassified_images: List[str], folder_info: Dict, move_log: Dict) -> str:
        """
        Write a plain-text summary of the run into ``base_folder``.

        The report contains the settings used, overall counts, a per-extension
        tally, the contents of each group (first 10 images), the leftover
        folder, per-folder move counts, move errors (first 20) and the rules
        applied.

        Args:
            base_folder: Folder that was classified; the report is saved here
                and image paths are shown relative to it.
            valid_clusters: The image groups that were kept.
            unclassified_images: Images that did not end up in a kept group.
            folder_info: Result of ``create_output_folders``.
            move_log: Result of ``move_images_to_folders``.

        Returns:
            Path of the report file that was written.
        """
        generated_at = datetime.now().strftime("%Y-%m-%d %H:%M:%S")

        # Headline numbers
        total_images = sum(map(len, valid_clusters)) + len(unclassified_images)
        cluster_groups = len(valid_clusters)

        # Per-extension tally over every image handled
        every_image = [path for group in valid_clusters for path in group]
        every_image.extend(unclassified_images)
        format_stats = {}
        for path in every_image:
            suffix = Path(path).suffix.lower()
            format_stats[suffix] = format_stats.get(suffix, 0) + 1

        heavy_rule = "=" * 60
        light_rule = "-" * 20
        errors = move_log['errors']
        heic_flag = '是' if HEIC_SUPPORT else '否'

        # Header and summary
        lines = [
            heavy_rule,
            "图片自动分类报告",
            heavy_rule,
            f"生成时间: {generated_at}",
            f"相似度阈值: {self.similarity_threshold}",
            f"最小聚类大小: {self.min_cluster_size} 张",
            f"HEIC支持: {heic_flag}",
            "",
            "分类统计:",
            f"  总图片数量: {total_images}",
            f"  有效相似组: {cluster_groups}",
            f"  未分类图片: {len(unclassified_images)}",
            f"  成功移动: {move_log['total_moved']}",
            f"  移动错误: {len(errors)}",
            "",
            "图片格式统计:",
        ]
        lines.extend(f"  {suffix}: {format_stats[suffix]} 张" for suffix in sorted(format_stats))
        lines += ["", "详细分类结果:", "-" * 40]

        # One section per group
        for number, info in enumerate(folder_info['cluster_folders'], 1):
            members = info['images']
            lines += [
                f"相似组 {number}: {os.path.basename(info['path'])}",
                f"  图片数量: {info['count']}",
                f"  文件夹: {info['path']}",
                "  包含图片:",
            ]
            # Only the first 10 images are listed, as paths relative to base_folder.
            lines += [f"    - {os.path.relpath(member, base_folder)}" for member in members[:10]]
            hidden = len(members) - 10
            if hidden > 0:
                lines.append(f"    ... 还有 {hidden} 张图片")
            lines.append("")

        # Leftover folder
        leftovers = folder_info['unclassified_folder']
        if leftovers:
            lines += [
                f"未分类图片: {leftovers['count']} 张",
                f"  文件夹: {leftovers['path']}",
                f"  说明: 小于 {self.min_cluster_size} 张的相似组合并到此文件夹",
                "",
            ]

        # Move counts per folder
        if move_log['cluster_moves']:
            lines += ["移动统计:", light_rule]
            lines += [
                f"  {entry['folder']}: {entry['moved_count']} 张"
                for entry in move_log['cluster_moves']
            ]
            if move_log['unclassified_moves'] > 0:
                lines.append(f"  未分类文件夹: {move_log['unclassified_moves']} 张")
            lines.append("")

        # Move errors, capped at the first 20
        if errors:
            lines += ["移动错误:", light_rule]
            lines += [f"  {error}" for error in errors[:20]]
            overflow = len(errors) - 20
            if overflow > 0:
                lines.append(f"  ... 还有 {overflow} 个错误")
            lines.append("")

        # Rules applied
        lines += [
            "分类规则说明:",
            f"- 相似度阈值: {self.similarity_threshold}",
            f"- 最小聚类大小: {self.min_cluster_size} 张",
            f"- 小于 {self.min_cluster_size} 张的相似组会合并到 'notclassified' 文件夹",
            f"- 支持的格式: {', '.join(sorted(self.supported_formats))}",
            "",
            "分类完成！",
            heavy_rule,
        ]

        # Save next to the classified images
        file_stamp = datetime.now().strftime('%Y%m%d_%H%M%S')
        report_file = os.path.join(base_folder, f"classification_report_{file_stamp}.txt")
        with open(report_file, 'w', encoding='utf-8') as handle:
            handle.write("\n".join(lines))

        return report_file

    def classify_images(self, folder_path: str) -> str:
        """
        Run the full pipeline: scan, cluster, create folders, move, report.

        Args:
            folder_path: Folder to classify (sub-folders are included).

        Returns:
            Path of the generated report file, or ``None`` when no supported
            image was found.
        """
        formats_text = ', '.join(sorted(self.supported_formats))
        print(f"开始分析文件夹: {folder_path}")
        print(f"支持的图片格式: {formats_text}")

        # Step 1: find the images, including those in sub-directories
        image_files = self.get_all_image_files(folder_path)
        if not image_files:
            print("未找到任何支持的图片文件")
            return None

        # Step 2: cluster -> (valid_clusters, unclassified_images)
        grouping = self.find_clusters(image_files)

        # Step 3: prepare the output folders
        folder_info = self.create_output_folders(folder_path, *grouping)

        # Step 4: move the files
        move_log = self.move_images_to_folders(*grouping, folder_info)

        # Step 5: write the report
        report_file = self.generate_report(folder_path, *grouping, folder_info, move_log)

        for message in (
            f"\n分类完成！",
            f"分类结果保存在: {folder_info['main_folder']}",
            f"详细报告: {report_file}",
        ):
            print(message)

        return report_file


def main():
    """
    Command-line entry point.

    Parses the arguments, validates them, and either previews the images
    found (``--dry-run``) or runs the full classification.

    Exits through ``argparse`` (status 2) when ``--threshold`` is outside
    0.0-1.0. Prints an error and returns when the given path does not exist
    or is not a directory.
    """
    import argparse

    parser = argparse.ArgumentParser(description='自动图片相似度分类工具')
    parser.add_argument('folder', help='包含图片的文件夹路径（会递归搜索子目录）')
    parser.add_argument('--threshold', '-t', type=float, default=0.75,
                       help='相似度阈值 (0.0-1.0), 默认为0.75')
    parser.add_argument('--min-cluster-size', '-m', type=int, default=3,
                       help='最小聚类大小，小于此数量的会放入notclassified文件夹，默认为3')
    parser.add_argument('--dry-run', action='store_true',
                       help='仅分析不移动文件')

    args = parser.parse_args()

    # Enforce the range promised by the --threshold help text (also rejects NaN).
    if not 0.0 <= args.threshold <= 1.0:
        parser.error(f"--threshold 必须在 0.0 到 1.0 之间: {args.threshold}")

    if not os.path.exists(args.folder):
        print(f"错误: 文件夹 {args.folder} 不存在")
        return

    # An existing regular file is not a usable image folder either.
    if not os.path.isdir(args.folder):
        print(f"错误: {args.folder} 不是文件夹")
        return

    # Build the classifier from the command-line settings
    classifier = ImageClusterClassifier(
        similarity_threshold=args.threshold,
        min_cluster_size=args.min_cluster_size
    )

    if args.dry_run:
        print("运行在预览模式，不会移动文件")
        # Preview only: list what would be processed, move nothing.
        image_files = classifier.get_all_image_files(args.folder)
        if image_files:
            print(f"预览模式: 找到 {len(image_files)} 张图片")
    else:
        # Full run
        report_file = classifier.classify_images(args.folder)

        if report_file:
            print(f"\n报告文件已生成: {report_file}")

# Run the command-line interface only when executed as a script, not on import.
if __name__ == "__main__":
    main()