Translation Notebook
Download ipynb file here.
Install the required packages:
!pip install deep-translator
!pip install PyPDF2
# Google Colab only: mount Google Drive so the notebook can work with files
# stored there. Skip this cell when running outside Colab.
from google.colab import drive
# '/content/drive' is the mount point passed to Colab for the Drive contents
drive.mount('/content/drive')
Convert pdf into plain text (optional):
from PyPDF2 import PdfReader
# Function to extract text from a PDF file
def extract_text_from_pdf(pdf_path):
    """Return the text of every page of a PDF, joined into one string.

    Pages are read in order through ``PdfReader(pdf_path).pages``. When
    ``extract_text()`` yields nothing for a page, that page contributes no
    text and a warning with its 1-based page number is printed instead.

    Args:
        pdf_path: Path of the PDF file handed to ``PdfReader``.

    Returns:
        The extracted page texts concatenated with no separator.
    """
    collected = []
    for number, pdf_page in enumerate(PdfReader(pdf_path).pages, start=1):
        content = pdf_page.extract_text()
        if not content:
            # Nothing extractable on this page: report it and move on
            print(f"Warning: No text found on page {number}")
            continue
        collected.append(content)
    return "".join(collected)
# Where the source PDF lives and where the plain-text copy should be written
pdf_path = 'path/to/input/pdf/file.pdf'
output_file = 'path/to/pdf/output.txt'

# Pull the text out of the PDF, then store it as a UTF-8 .txt file
pdf_text = extract_text_from_pdf(pdf_path)
with open(output_file, mode='w', encoding='utf-8') as txt_out:
    txt_out.write(pdf_text)

print("📄Text extraction complete!📄")
Split text into manageable chunks:
def split_text_file(input_file, max_length=4000):
    """Split a UTF-8 text file into numbered chunk files.

    The file is read in full and cut every ``max_length`` characters. Each
    piece is written next to the input as ``<base>_<n>.txt``, where
    ``<base>`` is ``input_file`` without its extension and ``n`` starts at 1.
    An empty input file produces no chunk files.

    Cuts are made at fixed character offsets, so a boundary may fall in the
    middle of a word or sentence.

    Args:
        input_file: Path of the text file to split.
        max_length: Maximum number of characters per chunk. Defaults to 4000.

    Raises:
        ValueError: If ``max_length`` is not a positive integer.
    """
    # Imported here because, in notebook order, this cell runs before the
    # cell that does the module-level ``import os``.
    import os

    if max_length <= 0:
        raise ValueError("max_length must be a positive integer")

    with open(input_file, 'r', encoding='utf-8') as file:
        text = file.read()

    chunks = [text[i:i + max_length] for i in range(0, len(text), max_length)]

    # splitext strips only the final extension, keeping any directory part
    file_base_name = os.path.splitext(input_file)[0]

    for index, chunk in enumerate(chunks, start=1):
        output_file_name = f"{file_base_name}_{index}.txt"
        with open(output_file_name, 'w', encoding='utf-8') as file:
            file.write(chunk)
# Replace the placeholder path below with the path to the .txt file to split.
# The chunks are written alongside it as <name>_1.txt, <name>_2.txt, ...
split_text_file('path/to/input/text/file.txt')
# Printed once the call above has returned
print(f"✂️Successfully created TXT files!✂️")
The translation itself:
import os
from deep_translator import GoogleTranslator
# Set up the translator: French to English
translator = GoogleTranslator(source='fr', target='en')

# Folder holding the .txt files to translate, and folder for the translations
input_folder = 'path/to/input/folder'
output_folder = 'path/to/output/folder'

# Create the output folder (and any missing parents); exist_ok avoids the
# separate existence check and is a no-op when the folder is already there
os.makedirs(output_folder, exist_ok=True)
# Function to translate text from a file
def translate_file(input_file, output_file):
    """Translate the contents of one text file and save the result.

    Reads ``input_file`` as UTF-8, sends its whole text through the
    module-level ``translator`` in a single ``translate`` call, and writes
    the value that call returns to ``output_file`` as UTF-8.

    Args:
        input_file: Path of the text file to translate.
        output_file: Path the translated text is written to.
    """
    with open(input_file, 'r', encoding='utf-8') as source:
        original_text = source.read()

    translated_text = translator.translate(original_text)

    with open(output_file, 'w', encoding='utf-8') as target:
        target.write(translated_text)
# Translate every .txt file found in the input folder
for filename in os.listdir(input_folder):
    if not filename.endswith('.txt'):
        continue  # anything that is not a .txt file is left alone
    source_path = os.path.join(input_folder, filename)
    target_path = os.path.join(output_folder, f'translated_{filename}')
    translate_file(source_path, target_path)

print("📝Translation complete!📝")
Concatenate the results into a single txt file (optional):
# Combine the translated text files into a single file, in chunk order
import re

# Folder containing the .txt files to merge, and the combined file to produce
input_folder = 'path/to/input/folder'
output_file = 'path/to/output/file.txt'


def _natural_key(name):
    """Sort key that compares digit runs as numbers, so '_2' sorts before '_10'."""
    # re.split with a capturing group alternates text, digits, text, ...;
    # odd positions are therefore always the digit runs.
    parts = re.split(r'(\d+)', name)
    return [int(part) if position % 2 else part
            for position, part in enumerate(parts)]


# os.listdir returns names in arbitrary order, so sort them explicitly;
# otherwise the chunks could be stitched together out of sequence.
txt_names = sorted(
    (name for name in os.listdir(input_folder) if name.endswith('.txt')),
    key=_natural_key,
)

# Used to avoid reading the combined file into itself when it is located
# inside the input folder.
output_abspath = os.path.abspath(output_file)

with open(output_file, 'w', encoding='utf-8') as outfile:
    for filename in txt_names:
        input_file = os.path.join(input_folder, filename)
        if os.path.abspath(input_file) == output_abspath:
            continue
        # Append this file's content to the combined output
        with open(input_file, 'r', encoding='utf-8') as infile:
            outfile.write(infile.read())
        outfile.write("\n")  # Add a newline between files

print("📚Concatenation complete!📚")