# configs/data_juicer_recipes/redpajama-book-refine.yaml

# global parameters
# The two paths below are placeholders and must be replaced before running.
project_name: 'Data-Juicer-recipes-book'
dataset_path: '/path/to/your/dataset'  # path to your dataset directory or file
export_path: '/path/to/your/dataset.jsonl'  # output path (a .jsonl file here); presumably where the processed dataset is written

np: 50  # number of subprocesses used to process your dataset
open_tracer: true  # NOTE(review): presumably turns on tracing of per-operator changes -- confirm against the Data-Juicer docs

# process schedule
# a list (YAML sequence) of process operators, each with its arguments
process:
  # Mappers. Each entry is a bare `name:` key, so its value is null and no
  # arguments are given here (presumably the operator defaults apply --
  # confirm against the operator documentation).
  - clean_email_mapper:
  - clean_links_mapper:
  - fix_unicode_mapper:
  - punctuation_normalization_mapper:
  - whitespace_normalization_mapper:

  # Filters. Notation used in the threshold comments below:
  #   `3sigma`        -- the value is set at the 3-sigma point of the statistic
  #   `<3sigma (x)`   -- the value is deliberately below the 3-sigma point x
  #   `>3sigma (x)`   -- the value is deliberately above the 3-sigma point x
  # NOTE(review): the statistics were presumably measured on the dataset this
  # recipe targets -- confirm before reusing the thresholds elsewhere.
  - alphanumeric_filter:
      tokenization: false
      min_ratio: 0.55  # <3sigma (0.697)
      max_ratio: 0.854  # 3sigma
  # NOTE(review): tagged "for code" although this is a book recipe --
  # presumably carried over from a code recipe; confirm it is intended.
  - average_line_length_filter:  # for code
      max_len: 500  # >3sigma (364)
  - character_repetition_filter:
      rep_len: 10
      max_ratio: 0.2  # >3sigma (0.12)
  - flagged_words_filter:
      lang: en
      tokenization: true
      max_ratio: 0.00047  # 3sigma
  # No `lang` key is set here: the language restriction was removed and only
  # the minimum score is thresholded.
  - language_id_score_filter:  # language filter removed
      min_score: 0.2
  # NOTE(review): also tagged "for code" -- see the note on
  # average_line_length_filter above.
  - maximum_line_length_filter:  # for code
      max_len: 13381  # 3sigma
  - perplexity_filter:
      lang: en
      max_ppl: 6000  # <3sigma (16516)
  - special_characters_filter:
      max_ratio: 0.5  # >3sigma (0.32)
  - words_num_filter:
      lang: en
      tokenization: true
      min_num: 1000  # lower bound carries no sigma annotation
      max_num: 539754  # 3sigma
  - word_repetition_filter:
      lang: en
      tokenization: true
      rep_len: 10
      max_ratio: 0.194  # 3sigma

  # Deduplication: the only deduplicator in this recipe and the last entry
  # of the process list.
  - document_simhash_deduplicator:
      tokenization: space
      window_size: 6
      lowercase: true
      # Single-quoted so the backslash stays literal in YAML. `\p{P}` is the
      # regex Unicode punctuation class; presumably matched text is ignored
      # when computing the hash -- confirm against the operator docs.
      ignore_pattern: '\p{P}'
      num_blocks: 6
      # NOTE(review): presumably the maximum Hamming distance at which two
      # documents count as duplicates -- confirm.
      hamming_distance: 4