FASTQ Processing

Processing FASTQ files for barcode extraction and classification

FASTQ File Handler


source

FastqHandler

 FastqHandler ()

Handles FASTQ file operations.

FASTQ Output Manager


source

FastqOutputManager

 FastqOutputManager (output_prefix:str, output_dir:str,
                     categories:List[str], compress:bool=True,
                     write_output:bool=True)

Manages output FASTQ files for different barcode categories.

FASTQ Processing Functions


source

prepare_fastq_categories

 prepare_fastq_categories
                           (barcodes:List[BarcodeSeqKit.core.BarcodeConfig
                           ])

*Prepare output categories based on barcodes.*

**Args:**

- `barcodes`: List of barcode configurations

**Returns:** List of category names


source

process_fastq_files

 process_fastq_files (config:BarcodeSeqKit.core.BarcodeExtractorConfig,
                      fastq_files:List[str], compress_output:bool=True,
                      search_both_reads:bool=True)

*Process FASTQ files to extract barcodes.*

**Args:**

- `config`: Barcode extractor configuration
- `fastq_files`: List of FASTQ files (either 1 or 2 files)
- `compress_output`: Whether to compress output files
- `search_both_reads`: Whether to search for barcodes in both reads of paired-end data

**Returns:** Statistics from the extraction process


source

save_statistics

 save_statistics (stats:BarcodeSeqKit.core.ExtractionStatistics,
                  output_prefix:str, output_dir:str)

*Save extraction statistics to files.*

**Args:**

- `stats`: Extraction statistics
- `output_prefix`: Prefix for output files
- `output_dir`: Directory for output files

# Example usage with test FASTQ files.
#
# Runs barcode extraction on the paired-end test files under ../tests (when
# both are present), prints the collected statistics, and lists the FASTQ
# files written to the output directory.
#
# NOTE(review): process_fastq_files and FastqHandler are not imported in this
# snippet; they are assumed to already be in scope -- confirm before running
# it standalone.
import os  # needed for makedirs / path joins / directory listing below

from BarcodeSeqKit.core import BarcodeConfig, BarcodeLocationType, BarcodeExtractorConfig

# Define barcodes to search for
barcodes = [
    BarcodeConfig(
        sequence="TAACTGAGGCCGGC",  # 3' barcode
        location=BarcodeLocationType.THREE_PRIME,
        name="3prime",
        description="3' barcode from test data"
    ),
    BarcodeConfig(
        sequence="CTGACTCCTTAAGGGCC",  # 5' barcode
        location=BarcodeLocationType.FIVE_PRIME,
        name="5prime",
        description="5' barcode from test data"
    )
]

# Create a configuration; the output directory is created up front so the
# later directory listing cannot fail on a missing path.
output_dir = "../tests/fastq_output"
os.makedirs(output_dir, exist_ok=True)

config = BarcodeExtractorConfig(
    barcodes=barcodes,
    output_prefix="test_extraction",
    output_dir=output_dir,
    max_mismatches=0,
    verbose=True
)

# Path to test FASTQ files
fastq_dir = "../tests"
test_fastq1 = os.path.join(fastq_dir, "test.1.fastq.gz")
test_fastq2 = os.path.join(fastq_dir, "test.2.fastq.gz")

# Only run the extraction when both paired-end inputs are available
if os.path.exists(test_fastq1) and os.path.exists(test_fastq2):
    print(f"Processing FASTQ files: {test_fastq1}, {test_fastq2}")

    # Process the FASTQ files
    stats = process_fastq_files(
        config=config,
        fastq_files=[test_fastq1, test_fastq2],
        compress_output=True,
        search_both_reads=True
    )

    # Print results
    print(f"\nTotal reads: {stats.total_reads}")
    print(f"Total barcode matches: {stats.total_barcode_matches}")

    for barcode_name, count in stats.matches_by_barcode.items():
        print(f"  {barcode_name}: {count} matches")

    for orientation, count in stats.matches_by_orientation.items():
        print(f"  Orientation {orientation}: {count} matches")

    for category, count in stats.matches_by_category.items():
        print(f"  Category {category}: {count} matches")

    # List the output files. os.listdir returns entries in arbitrary order,
    # so sort them to keep the printed listing reproducible between runs.
    output_files = sorted(
        f for f in os.listdir(output_dir)
        if f.startswith("test_extraction_") and f.endswith((".fastq.gz", ".fastq"))
    )
    print("\nOutput files:")
    for f in output_files:
        path = os.path.join(output_dir, f)
        if f.endswith(".fastq.gz"):
            try:
                # Count FASTQ reads
                read_count = FastqHandler.count_fastq_reads(path)
                print(f"  {f} ({read_count} reads)")
            except Exception as e:
                # Best-effort reporting: fall back to the file size and show
                # the error instead of aborting the listing.
                size = os.path.getsize(path)
                print(f"  {f} ({size} bytes) - Error getting read count: {e}")
        else:
            # Uncompressed outputs are reported by size only
            size = os.path.getsize(path)
            print(f"  {f} ({size} bytes)")
else:
    print(f"Test files not found. Expected: {test_fastq1} and {test_fastq2}")
Test files not found. Expected: ../tests/test.1.fastq.gz and ../tests/test.2.fastq.gz