Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions .golangci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,9 @@ linters:
path: chardet\.go

settings:
gomoddirectives:
replace-allow-list:
- github.com/kdomanski/iso9660
gosec:
excludes:
- G304
Expand Down
25 changes: 23 additions & 2 deletions files.go
Original file line number Diff line number Diff line change
Expand Up @@ -352,11 +352,32 @@ func ExtractFile(xFile *XFile) (size uint64, filesList, archiveList []string, er

for _, ext := range extension2function {
if strings.HasSuffix(sName, ext.Extension) {
return ext.Extract(xFile)
size, filesList, archiveList, err = ext.Extract(xFile)
if err == nil {
return size, filesList, archiveList, nil
}

// Extension matched but extraction failed; try signature detection as fallback.
break
}
}

return 0, nil, nil, fmt.Errorf("%w: %s", ErrUnknownArchiveType, xFile.FilePath)
// Fall back to file signature (magic number) detection.
xFile.Debugf("falling back to signature detection for %s (extension error: %v)", xFile.FilePath, err)

extractFn, sigErr := detectBySignature(xFile.FilePath)
if sigErr != nil {
extErr := &ExtractError{}
if err != nil {
extErr.Errs = append(extErr.Errs, err)
}

extErr.Errs = append(extErr.Errs, sigErr)

return 0, nil, nil, extErr
}

return extractFn(xFile)
}

// MoveFiles relocates files then removes the folder they were in.
Expand Down
108 changes: 108 additions & 0 deletions magic.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,108 @@
package xtractr

/* Code to detect archive types by file signatures (magic numbers). */

import (
	"bytes"
	"errors"
	"fmt"
	"io"
	"os"
)

// signature maps a byte pattern at a specific offset to an extract function.
// detectBySignature walks a slice of these in order and returns the Extract
// value of the first entry whose Magic bytes appear at Offset in the file.
type signature struct {
// Offset is the byte offset, counted from the start of the file, where the
// magic bytes are expected.
Offset int
// Magic is the byte sequence to match at Offset.
Magic []byte
// Extract is the extract function handed back to the caller when this
// signature matches.
Extract Interface
}

// maxSignatureRead caps how many leading bytes of a file are read for
// signature detection. The deepest entry in signatureTable is the 5-byte
// "CD001" ISO9660 marker at offset 0x9001, so the cap is that offset plus
// the marker length.
const maxSignatureRead = 0x9001 + 5

// signatureTable lists the known file signatures (magic numbers) together with
// the extract function for each format. Entries are checked in order and the
// first match wins, so a longer pattern must precede any shorter pattern that
// is a prefix of it.
//
//nolint:gochecknoglobals
var signatureTable = []signature{
	// RAR v5: "Rar!" 1A 07 01 00 (longer match first).
	{Offset: 0, Magic: []byte("Rar!\x1a\x07\x01\x00"), Extract: ExtractRAR},
	// RAR v4: "Rar!" 1A 07 00.
	{Offset: 0, Magic: []byte("Rar!\x1a\x07\x00"), Extract: ExtractRAR},
	// 7-Zip: "7z" BC AF 27 1C.
	{Offset: 0, Magic: []byte("7z\xbc\xaf\x27\x1c"), Extract: Extract7z},
	// ZIP: "PK" 03 04.
	{Offset: 0, Magic: []byte("PK\x03\x04"), Extract: ChngInt(ExtractZIP)},
	// Gzip: 1F 8B.
	{Offset: 0, Magic: []byte("\x1f\x8b"), Extract: ChngInt(ExtractGzip)},
	// Bzip2: "BZh".
	{Offset: 0, Magic: []byte("BZh"), Extract: ChngInt(ExtractBzip)},
	// XZ: FD "7zXZ" 00.
	{Offset: 0, Magic: []byte("\xfd7zXZ\x00"), Extract: ChngInt(ExtractXZ)},
	// Zstandard: 28 B5 2F FD.
	{Offset: 0, Magic: []byte("\x28\xb5\x2f\xfd"), Extract: ChngInt(ExtractZstandard)},
	// LZ4: 04 22 4D 18.
	{Offset: 0, Magic: []byte("\x04\x22\x4d\x18"), Extract: ChngInt(ExtractLZ4)},
	// LZMA: 5D 00 00.
	{Offset: 0, Magic: []byte("\x5d\x00\x00"), Extract: ChngInt(ExtractLZMA)},
	// Brotli: CE B2 CF 81.
	{Offset: 0, Magic: []byte("\xce\xb2\xcf\x81"), Extract: ChngInt(ExtractBrotli)},
	// AR / DEB: "!<arch>\n".
	{Offset: 0, Magic: []byte("!<arch>\n"), Extract: ChngInt(ExtractAr)},
	// RPM: ED AB EE DB.
	{Offset: 0, Magic: []byte("\xed\xab\xee\xdb"), Extract: ChngInt(ExtractRPM)},
	// ISO9660 "CD001" at offset 0x8001.
	{Offset: 0x8001, Magic: []byte("CD001"), Extract: ChngInt(ExtractISO)}, //nolint:mnd
	// ISO9660 "CD001" at offset 0x8801.
	{Offset: 0x8801, Magic: []byte("CD001"), Extract: ChngInt(ExtractISO)}, //nolint:mnd
	// ISO9660 "CD001" at offset 0x9001.
	{Offset: 0x9001, Magic: []byte("CD001"), Extract: ChngInt(ExtractISO)}, //nolint:mnd
}
Comment thread
davidnewhall marked this conversation as resolved.

// detectBySignature reads the leading bytes of the file at filePath and
// compares them against each entry of signatureTable, in table order.
//
// It returns the Extract value of the first entry whose Magic bytes are found
// at its Offset. If the file cannot be opened, stat'd or read, the underlying
// error is returned wrapped with context. If no entry matches, the returned
// error wraps ErrUnknownArchiveType.
func detectBySignature(filePath string) (Interface, error) {
	file, err := os.Open(filePath)
	if err != nil {
		return nil, fmt.Errorf("opening file for signature detection: %w", err)
	}
	defer file.Close()

	stat, err := file.Stat()
	if err != nil {
		return nil, fmt.Errorf("stat file for signature detection: %w", err)
	}

	// Read no more than the deepest signature needs, and no more than the file holds.
	buf := make([]byte, min(stat.Size(), int64(maxSignatureRead)))

	// A single Read may return fewer bytes than requested without reporting an
	// error, which would hide signatures located at large offsets. io.ReadFull
	// keeps reading until the buffer is full or the file ends.
	n, err := io.ReadFull(file, buf)
	if err != nil && !errors.Is(err, io.EOF) && !errors.Is(err, io.ErrUnexpectedEOF) {
		return nil, fmt.Errorf("reading file for signature detection: %w", err)
	}

	// The file may have shrunk since Stat; only match against what was read.
	buf = buf[:n]

	for _, sig := range signatureTable {
		end := sig.Offset + len(sig.Magic)
		if end > len(buf) {
			// Not enough data to contain this signature at its offset.
			continue
		}

		if bytes.Equal(buf[sig.Offset:end], sig.Magic) {
			return sig.Extract, nil
		}
	}

	return nil, fmt.Errorf("%w: %s", ErrUnknownArchiveType, filePath)
}

// IsArchiveFileByContent reports whether the file at path begins with a
// recognized archive file signature. Unlike IsArchiveFile, the decision is
// based on the file's actual content rather than its extension. Any failure
// during signature detection yields false.
func IsArchiveFileByContent(path string) bool {
	if fn, err := detectBySignature(path); err != nil || fn == nil {
		return false
	}

	return true
}
Loading
Loading