#!/usr/bin/perl
#############################################################
# repairWTgedexport.pl
# Repair WikiTree Gedcom Export
#
# Dennis Wheeler
# Feb 2017
#
# The WikiTree gedcom export is broken. It strictly cuts off
# long text lines at 78 characters (counting the CONC tag)
# without regard to character boundaries (thus splitting utf-8
# characters with a newline and CONC tag in the middle).
#
# see discussion:
# https://www.wikitree.com/g2g/348490/is-it-ok-that-gedcom-export-splits-utf-8-characters
#
# This script repairs the resulting files by finding the
# lines with invalid (split) utf-8 characters, stripping
# the intermediate characters, and concatenating
# the two lines back together into a single line.
#
# Since the line length limit for the gedcom CONT tag
# is 256 characters, merging two lines together will
# not break importing back into other applications.
#
# limitations:
# The script assumes there will be only two consecutive lines
# affected. But it's possible there could be three or more. Will
# need to revisit if there are more than two consecutive lines.
# It also assumes the second line will be a CONC tagged line.
#
# Outputs to stdout
#
# Usage:
#   ./repairWTgedexport.pl --check gedcom.ged
#   ./repairWTgedexport.pl --view gedcom.ged
#   ./repairWTgedexport.pl --fix gedcom.ged > newfile.ged
#
#############################################################

use strict;
use warnings;
use utf8;
use Getopt::Long qw(GetOptions);

# Command line options. Each option takes the gedcom file name as its value.
my $check;        # print a count of the invalid utf-8 lines
my $view;         # print every invalid line with its line number
my $fix;          # print the repaired gedcom to stdout
my $badgedcom;    # name of the file to read

GetOptions(
    'check=s' => \$check,
    'view=s'  => \$view,
    'fix=s'   => \$fix,
);

($check || $view || $fix) || die "Usage: $0 --check --view --fix filename\n";

# If more than one option was given, the file named by the last of
# these assignments is the one that is read.
$badgedcom = $check if $check;
$badgedcom = $view  if $view;
$badgedcom = $fix   if $fix;

# Returns true when the octets in $bytes form well-formed utf-8.
# utf8::decode() converts its argument in place; @_ is unpacked into a
# private copy first so the caller's raw bytes are never re-encoded.
sub is_valid_utf8 {
    my ($bytes) = @_;
    return utf8::decode($bytes);
}

# Three-argument open: the file name can never be taken for an open mode.
open my $info, '<', $badgedcom or die "Could not open $badgedcom: $!";

my $prevline;        # first half of a split line, waiting for its CONC line
my $badlines = 0;    # number of lines that are not valid utf-8

while (my $line = <$info>) {
    chomp $line;

    my $invalid = !is_valid_utf8($line);

    if ($invalid) {
        # debugging, print invalid lines with line numbers
        print "*** INVALID:$.: $line\n" if $view;
        $badlines++;
    }

    if (defined $prevline) {
        # Only a CONC line that is itself invalid can be the second half
        # of a split character. The tag is stripped at the start of the
        # line only, so "N CONC " inside the note text is left alone.
        if ($invalid && $line =~ s/\A\s*[0-9]+ CONC //) {
            $line = $prevline . $line;
            $invalid = 0;    # merged: treat as a finished line
        }
        else {
            # No partner followed the saved line. Emit it unchanged, in
            # its original position, rather than losing it.
            print $prevline . "\n" if $fix;
        }
        undef $prevline;
    }

    if ($invalid) {
        # Possible first half of a split character: hold it back and
        # decide what to do once the next line has been read.
        $prevline = $line;
        next;
    }

    # output finished lines
    print $line . "\n" if $fix;
}

close $info;

# A line still held back at end of file has no partner; emit it unchanged.
print $prevline . "\n" if $fix && defined $prevline;

if ($check) {
    print "This gedcom file, $badgedcom, has $badlines invalid utf-8 lines\n";
}