package MARC::Lint; use strict; use integer; eval 'use warnings' if $] >= 5.006; =head1 NAME MARC::Lint - Perl extension for checking validity of MARC records =head1 VERSION Version 1.10 $Id: Lint.pm 3430 2002-09-24 05:17:39Z jrm21 $ =cut use vars '$VERSION'; $VERSION = '1.10'; use MARC::Record; use MARC::Field; =head1 SYNOPSIS use MARC::Record; use MARC::Lint; my $linter = new MARC::Lint; my $filename = shift; open( IN, "<", $filename ) or die "Couldn't open $filename: $!\n"; binmode( IN ); # for the Windows folks while ( !eof(IN) ) { my $marc = MARC::Record::next_from_file( *IN ); die $MARC::Record::ERROR unless $marc; $linter->check_record( $marc ); # Print the title tag print $marc->subfield(245,"a"), "\n"; # Print the errors that were found print join( "\n", $linter->warnings ), "\n"; } # while close IN or die "Error closing $filename: $!\n"; Given the following MARC record: LDR 00000nam 22002538a 4500 100 14 _aWall, Larry. 110 1 _aO'Reilly & Associates. 245 90 _aProgramming Perl / _aBig Book of Perl / _cLarry Wall, Tom Christiansen & Jon Orwant. 250 _a3rd ed. 250 _a3rd ed. 260 _aCambridge, Mass. : _bO'Reilly, _r2000. 590 4 _aPersonally signed by Larry. 856 43 _uhttp://www.perl.com/ the following errors are generated: 1XX: Only one 1XX tag is allowed, but I found 2 of them. 100: Indicator 2 must be blank 245: Indicator 1 must be 0 or 1 245: Subfield _a is not repeatable. 250: Field is not repeatable. 260: Subfield _r is not valid. 260: Must have a subfield _c. 590: Indicator 1 must be blank 856: Indicator 2 must be blank, 0, 1, 2 or 8 =head1 DESCRIPTION Module for checking validity of MARC records. 99% of the users will want to do something like is shown in the synopsis. The other intrepid 1% will overload the C module's methods and provide their own special field-level checking. What this means is that if you have certain requirements, such as making sure that all 952 tags have a certain call number in them, you can write a function that checks for that, and still get all the benefits of the MARC::Lint framework. =head1 EXPORT None. Everything is done through objects. =head1 METHODS =head2 C No parms needed. The C object is little more than a list of warnings and a bunch of rules. =cut sub new() { my $class = shift; $class = ref($class) || $class; my $self = { _warnings => [], }; bless $self, $class; $self->_read_rules(); return $self; } =head2 C Returns a list of warnings found by C and its brethren. =cut sub warnings { my $self = shift; return wantarray ? @{$self->{_warnings}} : scalar @{$self->{_warnings}}; } =head2 C Clear the list of warnings for this linter object. It's automatically called when you call C. =cut sub clear_warnings { my $self = shift; $self->{_warnings} = []; } =head2 C Create a warning message, built from strings passed, like a C statement. Typically, you'll leave this to C, but industrious programmers may want to do their own checking as well. =cut sub warn { my $self = shift; push( @{$self->{_warnings}}, join( "", @_ ) ); return undef; } =head2 C Does all sorts of lint-like checks on the MARC record I, both on the record as a whole, and on the individual fields & subfields. =cut sub check_record { my $self = shift; my $marc = shift; $self->clear_warnings(); (ref($marc) eq "MARC::Record") or return $self->warn( "Must pass a MARC::Record object to check_record" ); if ( (my @_1xx = $marc->field( "1XX" )) > 1 ) { $self->warn( "1XX: Only one 1XX tag is allowed, but I found ", scalar @_1xx, " of them." ); } if ( not $marc->field( 245 ) ) { $self->warn( "245: No 245 tag." ); } my %field_seen; my $rules = $self->{_rules}; for my $field ( $marc->fields ) { my $tagno = $field->tag; my $tagrules = $rules->{$tagno} or next; if ( $tagrules->{NR} && $field_seen{$tagno} ) { $self->warn( "$tagno: Field is not repeatable." ); } if ( $tagno >= 10 ) { for my $ind ( 1..2 ) { my $indvalue = $field->indicator($ind); if ( not ($indvalue =~ $tagrules->{"ind$ind" . "_regex"}) ) { $self->warn( "$tagno: Indicator $ind must be ", $tagrules->{"ind$ind" . "_desc"}, " but it's \"$indvalue\"" ); } } my %sub_seen; for my $subfield ( $field->subfields ) { my ($code,$data) = @$subfield; my $rule = $tagrules->{$code}; if ( not defined $rule ) { $self->warn( "$tagno: Subfield _$code is not allowed." ); } elsif ( ($rule eq "NR") && $sub_seen{$code} ) { $self->warn( "$tagno: Subfield _$code is not repeatable." ); } if ( $data =~ /[\t\r\n]/ ) { $self->warn( "$tagno: Subfield _$code has an invalid control character" ); } ++$sub_seen{$code}; } } # Check to see if a check_xxx() function exists, and call it on the field if it does my $checker = "check_$tagno"; if ( $self->can( $checker ) ) { $self->$checker( $field ); } ++$field_seen{$tagno}; } # for return; } =head2 C(field)> Various functions to check the different fields. If the function doesn't exist, then it doesn't get checked. =cut sub check_245 { my $self = shift; my $field = shift; if ( not $field->subfield( "a" ) ) { $self->warn( "245: Must have a subfield _a." ); } } sub check_260 { my $self = shift; my $field = shift; if ( not $field->subfield( "c" ) ) { $self->warn( "260: Must have a subfield _c." ); } } =head1 SEE ALSO Check the docs for L. All software links are there. =head1 TODO =over 4 =item * ISBN and ISSN checking We can check the 020 and 022 fields with the C and C modules, respectively. =back =head1 LICENSE This code may be distributed under the same terms as Perl itself. Please note that these modules are not products of or supported by the employers of the various contributors to the code. =head1 AUTHOR Andy Lester, Emarc@petdance.comE or Ealester@flr.follett.comE =cut # Used only to read the stuff from __DATA__ sub _read_rules() { my $self = shift; my $tell = tell(DATA); # Stash the position so we can reset it for next time local $/ = ""; while ( my $lines = ) { $lines =~ s/\s+$//; my @keyvals = split( /\s+/, $lines ); my $tagno = shift @keyvals; my $repeatable = shift @keyvals; my @tag_range = ($tagno); if ( $tagno =~ /^(\d\d)X/ ) { my $base = $1; @tag_range = ( "${base}0" .. "${base}9" ); } # Handle the ranges of tags. for my $currtag ( @tag_range ) { $self->_parse_tag_rules( $currtag, $repeatable, @keyvals ); } # for # I guess I could just have multiple references to the same tag, but I'm not that worried about memory } # while seek( DATA, $tell, 0 ); } sub _parse_tag_rules { my $self = shift; my $tagno = shift; my $repeatable = shift; my @keyvals = @_; my $rules = ($self->{_rules}->{$tagno} ||= {}); $rules->{$repeatable} = $repeatable; while ( @keyvals ) { my $key = shift @keyvals; my $val = shift @keyvals; $rules->{$key} = $val; # Do magic for indicators if ( $key =~ /^ind/ ) { my $desc; my $regex; if ( $val eq "blank" ) { $desc = "blank"; $regex = qr/^ $/; } else { $desc = _nice_list($val); $val =~ s/^b/ /; $regex = qr/^[$val]$/; } $rules->{$key."_desc"} = $desc; $rules->{$key."_regex"} = $regex; } # if indicator } # while } sub _nice_list($) { my $str = shift; if ( $str =~ s/(\d)-(\d)/$1 thru $2/ ) { return $str; } my @digits = split( //, $str ); $digits[0] = "blank" if $digits[0] eq "b"; my $last = pop @digits; return join( ", ", @digits ) . " or $last"; } sub _ind_regex($) { my $str = shift; return qr/^ $/ if $str eq "blank"; return qr/^[$str]$/; } 1; __DATA__ 010 NR ind1 blank ind2 blank a NR z NR 016 R ind1 b7 ind2 blank a NR z R 2 NR 020 R ind1 blank ind2 blank a R c R 022 R ind1 blank ind2 blank a NR 040 NR ind1 blank ind2 blank a NR c NR d R 100 NR ind1 013 ind2 blank a NR q NR b R c R d NR e R 110 NR ind1 012 ind2 blank a NR b R 111 NR ind1 012 ind2 blank a NR n R d NR c NR e R 130 NR ind1 0-9 ind2 blank a NR n R p R h NR l NR k R s NR f NR 240 NR ind1 01 ind2 0-9 245 NR ind1 01 ind2 0-9 a NR n R p R h NR b NR s NR c NR 246 NR ind1 0123 ind2 012345678 a NR h NR b NR n R p R i NR f NR 250 NR ind1 blank ind2 blank a NR b NR 260 NR ind1 blank ind2 blank a R b R c R 300 NR ind1 blank ind2 blank a R b NR c R e NR 440 R ind1 blank ind2 0-9 a NR n R p R v NR 490 R ind1 01 ind2 blank a R v R 500 R ind1 blank ind2 blank a NR 504 R ind1 blank ind2 blank a NR 505 R ind1 0128 ind2 b0 a NR g R r R t R 520 R ind1 b018 ind2 blank a R b R 521 R ind1 b012348 ind2 blank a R b NR 526 R ind1 08 ind2 blank a NR b NR c NR d NR i NR x R z R 538 R ind1 blank ind2 blank a NR 546 R ind1 blank ind2 blank a NR 586 R ind1 b8 ind2 blank a NR 59X R ind1 blank ind2 blank a NR 600 R ind1 013 ind2 012567 a NR q NR b R c R d NR t NR v R x R y R z R 2 NR 610 R ind1 012 ind2 012567 a NR b R t NR v R x R y R z R 2 NR 611 R ind1 012 ind2 012567 a NR n R d NR c NR v R x R y R z R 2 NR 630 R ind1 0-9 ind2 012567 a NR n R p R l NR k R s NR f NR v R x R y R z R 2 NR 650 R ind1 blank ind2 012567 a NR v R x R y R z R 2 NR 651 R ind1 blank ind2 012567 a NR v R x R y R z R 2 NR 655 R ind1 blank ind2 7 a NR v R x R y R z R 2 NR 658 R ind1 blank ind2 blank a NR b R c NR d NR 2 NR 69X R ind1 blank ind2 blank a NR v R x R y R z R 700 R ind1 013 ind2 b2 a NR q NR b R c R d NR k R t NR e R f NR 710 R ind1 012 ind2 b2 a NR b R e R t NR 711 R ind1 012 ind2 b2 a NR n R d NR c NR t NR e R 730 R ind1 0-9 ind2 b2 a NR n R p R h NR l NR k R s NR f NR 740 R ind1 0-9 ind2 b2 a NR h NR n R p R 800 R ind1 013 ind2 blank a NR q NR b NR c R d NR t NR e R v NR 852 R ind1 b01234568 ind2 b012 a NR b R h NR i R k NR m NR t NR p NR 9 NR 856 R ind1 b012347 ind2 b0128 a R b NR d R f R h NR i R u R x R z R 2 NR 3 NR