#!/usr/bin/perl
# refresh-locale-codes -- Refresh the locale (aka ISO 639-1/639-2 codes)

# Copyright © 2013 Niels Thykier <niels@thykier.net>
# Copyright © 2019 Adam D. Barratt <adam@adam-barratt.org.uk>
# Based on a shell script, which was:
#   Copyright © 2010 Raphael Geissert <atomo64@gmail.com>
#
# This file is free software: you can redistribute it and/or modify it
# under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 2 of the License, or
# (at your option) any later version.
#
# This file is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
# General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this file.  If not, see <http://www.gnu.org/licenses/>.

use v5.20;
use warnings;
use utf8;

use Cwd qw(realpath);
use File::Basename qw(dirname);
use Unicode::UTF8 qw(encode_utf8);

# neither Path::This nor lib::relative are in Debian
use constant THISFILE => realpath __FILE__;
use constant THISDIR => dirname realpath __FILE__;

# use Lintian modules that belong to this program
use lib THISDIR . '/../lib';

use IPC::Run3;
use POSIX qw(strftime);

use Lintian::Util qw(locate_executable);

my ($DATADIR) = @ARGV;
my (%CODES, $outfile);

die encode_utf8("Usage: $0 <path-to-data-dir>\n")
  unless defined $DATADIR and -d $DATADIR;

check_requirements();

my $date = strftime('%Y-%m-%d', gmtime);

$ENV{LC_ALL} = 'C';

parse_iso_query();
parse_iso_xml();

$outfile = "$DATADIR/files/locale-codes.new";

open(my $out, '>', $outfile)
  or die encode_utf8("Cannot open $outfile");

print {$out} encode_utf8(<<"EOF");
# List of locale codes.  This is derived from the ISO 639-1, ISO
# 639-2, and ISO 639-3 standards.
# If a language has 639-1 and 639-2 codes, the -1 code is also included
# as a key to be mapped to the -2 code.
#
# Last updated: $date

EOF

foreach my $code (sort keys %CODES) {
    my $alt = $CODES{$code};
    print {$out} encode_utf8($code);
    print {$out} encode_utf8(" $alt") if defined $alt;
    print {$out} encode_utf8("\n");
}

close($out);

rename $outfile, "$DATADIR/files/locale-codes"
  or die encode_utf8("rename $outfile -> $DATADIR/files/locale-codes: $!");

exit 0;

sub parse_iso_xml {
    my $iso_xml = '/usr/share/xml/iso-codes/iso_639_3.xml';
    open(my $fd, '<', $iso_xml)
      or die encode_utf8("Cannot open $iso_xml");

    local $/ = '/>';
    while (my $line = <$fd>) {
        my $special = 0;

        # skip it if it is a "special" isotype (#692548, comment #10).  However
        # sometimes we "collect" these from iso-query.  If so, we have to
        # prune them from %CODES.
        $special = 1
          if $line =~ /\<iso_639_3_entry [^\>]* \btype=[\'\"]S?[\'\"]/x;

        # Extract the id of the entry.  We match the start of the tag
        # again to ensure we catch the id inside the tag.  (Our input
        # separator causes us to consume a lot of leading "stuff"
        # prior to the first entry being closed).
        next
          unless $line
          =~ /\<iso_639_3_entry [^\>]* \bid=[\'\"]([^\'\"]+)[\'\"]/x;

        my $id = lc $1;

        if ($special) {
            delete $CODES{$id};
        } else {
            $CODES{$id} = undef
              unless exists $CODES{$id};
        }
    }

    close($fd);
    return;
}

sub parse_iso_query {
    my @command = qw{isoquery -i 639-2};
    my $output;

    run3(\@command, \undef, \$output);
    my @lines = split(/\n/, $output);

    while (defined(my $line = shift @lines)) {

        next
          unless $line =~ /^(\S{3})\s+(?:\S{3}\s+)?(?:(\S{2})\s+)?/;
        my ($iso1, $iso2) = ($2, $1);
        next
          if $iso2 eq 'zxx';
        $iso2 = lc $iso2;

        $CODES{$iso2} = undef
          unless exists $CODES{$iso2};

        if (defined $iso1) {
            $iso1 = lc $iso1;
            $CODES{lc $iso2} = $iso1;
            $CODES{$iso1} = undef
              unless exists $CODES{$iso1};
        }
    }

    return;
}

sub check_requirements {
    my @missing;
    push @missing, 'isoquery in PATH'
      unless length locate_executable('isoquery');
    push @missing, 'The file /usr/share/xml/iso-codes/iso_639_3.xml'
      unless -e '/usr/share/xml/iso-codes/iso_639_3.xml';

    return unless @missing;

    print {*STDERR} encode_utf8("Missing requirements:\n");
    print {*STDERR} encode_utf8("\t", join("\n\t", @missing), "\n");
    exit 1;
}

# Local Variables:
# indent-tabs-mode: nil
# cperl-indent-level: 4
# End:
# vim: syntax=perl sw=4 sts=4 sr et
